From f5a52e1600e59d461a11abbf05585c0e0b19c906 Mon Sep 17 00:00:00 2001
From: HangXu <hangxu0304@gmail.com>
Date: Mon, 1 Jul 2024 13:44:21 +0800
Subject: [PATCH 001/167] fp8 operators for compressed communication

cast_to_fp8, cast_from_fp8, all_reduce_fp8
---
 colossalai/quantization/fp8.py | 105 +++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 colossalai/quantization/fp8.py

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
new file mode 100644
index 000000000000..d405de2de3d1
--- /dev/null
+++ b/colossalai/quantization/fp8.py
@@ -0,0 +1,105 @@
+import torch
+import torch.distributed as dist
+
+
+def cast_to_fp8(inp: torch.Tensor, scale=None, fp8_format="e4m3") -> (torch.Tensor, torch.Tensor):
+    r"""
+    casting torch Tensor into specified fp8 tensor with per-channel scaling or per-tensor scaling.
+    Args:
+        inp: input torch Tensor, should be in torch.FloatTensor, torch.HalfTensor, torch.BFloat16Tensor.
+        scale: scaling factor for fp8 casting. If it is None, then it is computed automatically. Per-channel scaling
+        is applied if input tensor is 2 dimension, otherwise, per-tensor scaling is applied.
+        fp8_format: e4m3 or e5m2
+
+    Returns:
+        Tuples: A tuple (fp8_tensor, scale)
+    """
+    if inp.dtype in [torch.FloatTensor, torch.HalfTensor, torch.BFloat16Tensor]:
+        return inp
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+
+    if inp.dim() == 2:
+        if scale is None:
+            per_channel_max = inp.abs().max(dim=-1).values
+            scale = per_channel_max
+        scale_inv = 1.0 / scale
+        scale_inv = scale_inv[:, None]
+        ret = (scale_inv * inp).to(fp8_type)
+    else:
+        if scale is None:
+            per_tensor_max = inp.abs().max()
+            scale = per_tensor_max
+        scale_inv = 1.0 / scale
+        ret = (scale_inv * inp).to(fp8_type)
+
+    return ret, scale
+
+
+def cast_from_fp8(inp: torch.Tensor, scale: torch.Tensor, ret_type: torch.dtype) -> torch.Tensor:
+    r"""
+
+    Args:
+        inp: should be a fp8 torch tensor in one of the types: [torch.float8_e4m3fn, torch.float8_e5m2].
+        scale: scaling factor returned by cast_to_fp8 function.
+        ret_type: the datatype of the returned tensor.
+
+    Returns:
+        torch.Tensor
+    """
+    if inp.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]:
+        return inp
+    if inp.dim() == 2:
+        ret = scale[:, None] * inp.to(ret_type)
+    else:
+        ret = scale * inp.to(ret_type)
+    return ret
+
+
+def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3") -> None:
+    r"""
+    This is an in-place operation for compressed all_reduce using fp8.
+    It works like dist.all_reduce but during communication the data is cast to fp8 format.
+
+    Args:
+        tensor: torch.Tensor in fp32, fp16, bf16 datatype.
+        fp8_format: e4m3 or e5m2
+
+    Returns:
+        None
+    """
+
+    world_size = dist.get_world_size()
+    rank = dist.get_rank()
+    input_type = tensor.dtype
+    input_shape = tensor.shape
+    input_device = tensor.device
+    input_size = tensor.numel()
+    tensor = tensor.flatten()
+
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+
+    ret, scale = cast_to_fp8(tensor, fp8_format=fp8_format)
+
+    inp = ret.view(torch.uint8)
+    input_chunks = list(torch.chunk(inp, world_size, dim=0))
+    if dist.get_rank() == world_size - 1:
+        output_chunks = [torch.empty_like(input_chunks[-1]) for _ in range(world_size)]
+    else:
+        output_chunks = [torch.empty_like(input_chunks[0]) for _ in range(world_size)]
+    dist.all_to_all(output_chunks, input_chunks)
+    scale_list = [torch.ones(1, dtype=scale.dtype, device=input_device) for _ in range(world_size)]
+    dist.all_gather(scale_list, scale)
+    summed_out = torch.zeros_like(output_chunks[0]).to(input_type)
+    for scale, out in zip(scale_list, output_chunks):
+        out = out.view(fp8_type)
+        summed_out += cast_from_fp8(out, scale, input_type)
+
+    summed_out_fp8, scale = cast_to_fp8(summed_out, fp8_format=fp8_format)
+    dist.all_gather(scale_list, scale)
+
+    tensor_list = list(torch.chunk(torch.empty(input_size, device=input_device, dtype=torch.uint8), world_size, dim=0))
+    dist.all_gather(tensor_list, summed_out_fp8.view(torch.uint8))
+    for i in range(world_size):
+        tensor_list[i] = tensor_list[i].view(fp8_type).to(input_type) * scale_list[i]
+    tensor_out = torch.cat(tensor_list, dim=0)
+    tensor.data = tensor_out.view(input_shape).to(input_type)
\ No newline at end of file

From e17f835df7c637e18df708b929b570c2ac459434 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 4 Jul 2024 12:47:16 +0000
Subject: [PATCH 002/167] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/quantization/fp8.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index d405de2de3d1..c880cd4aa44b 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -69,7 +69,7 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3") -> None:
     """
 
     world_size = dist.get_world_size()
-    rank = dist.get_rank()
+    dist.get_rank()
     input_type = tensor.dtype
     input_shape = tensor.shape
     input_device = tensor.device
@@ -102,4 +102,4 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3") -> None:
     for i in range(world_size):
         tensor_list[i] = tensor_list[i].view(fp8_type).to(input_type) * scale_list[i]
     tensor_out = torch.cat(tensor_list, dim=0)
-    tensor.data = tensor_out.view(input_shape).to(input_type)
\ No newline at end of file
+    tensor.data = tensor_out.view(input_shape).to(input_type)

From dbfa7d39fc06534cf3d44ba8d1a5ae4d147d7133 Mon Sep 17 00:00:00 2001
From: GuangyaoZhang <xjtu521@qq.com>
Date: Wed, 10 Jul 2024 08:13:26 +0000
Subject: [PATCH 003/167] fix typo

---
 colossalai/quantization/fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index c880cd4aa44b..58cedbc9554f 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -14,7 +14,7 @@ def cast_to_fp8(inp: torch.Tensor, scale=None, fp8_format="e4m3") -> (torch.Tens
     Returns:
         Tuples: A tuple (fp8_tensor, scale)
     """
-    if inp.dtype in [torch.FloatTensor, torch.HalfTensor, torch.BFloat16Tensor]:
+    if inp.dtype not in [torch.float32, torch.float16, torch.bfloat16]:
         return inp
     fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
 

From 1e1959467e4cbc0c0bc15aff4c19c969e76c9a72 Mon Sep 17 00:00:00 2001
From: BurkeHulk <hangxu0304@gmail.com>
Date: Fri, 12 Jul 2024 15:23:37 +0800
Subject: [PATCH 004/167] fix scaling algorithm in FP8 casting

---
 colossalai/quantization/fp8.py | 44 ++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index d405de2de3d1..051ecb45a8fc 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -1,8 +1,10 @@
+from typing import Any, Callable, List, Optional, Tuple, Union
+
 import torch
 import torch.distributed as dist
 
 
-def cast_to_fp8(inp: torch.Tensor, scale=None, fp8_format="e4m3") -> (torch.Tensor, torch.Tensor):
+def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3") -> (torch.Tensor, torch.Tensor):
     r"""
     casting torch Tensor into specified fp8 tensor with per-channel scaling or per-tensor scaling.
     Args:
@@ -14,28 +16,28 @@ def cast_to_fp8(inp: torch.Tensor, scale=None, fp8_format="e4m3") -> (torch.Tens
     Returns:
         Tuples: A tuple (fp8_tensor, scale)
     """
-    if inp.dtype in [torch.FloatTensor, torch.HalfTensor, torch.BFloat16Tensor]:
-        return inp
+
+    if inp.dtype not in [torch.float32, torch.float16, torch.bfloat16]:
+        raise TypeError("Only float16, bfloat16, and float32 are allowed.")
+
     fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+    fp8_max = torch.finfo(fp8_type).max
 
     if inp.dim() == 2:
-        if scale is None:
-            per_channel_max = inp.abs().max(dim=-1).values
-            scale = per_channel_max
-        scale_inv = 1.0 / scale
-        scale_inv = scale_inv[:, None]
-        ret = (scale_inv * inp).to(fp8_type)
+        per_channel_max = inp.abs().max(dim=-1).values.float()
+        per_channel_max = torch.where(per_channel_max > 0, per_channel_max, 1.0)
+        scale = fp8_max / per_channel_max[:, None]
     else:
-        if scale is None:
-            per_tensor_max = inp.abs().max()
-            scale = per_tensor_max
-        scale_inv = 1.0 / scale
-        ret = (scale_inv * inp).to(fp8_type)
+        per_tensor_max = inp.abs().max().float()
+        per_tensor_max = torch.where(per_tensor_max > 0, per_tensor_max, 1.0)
+        scale = fp8_max / per_tensor_max
 
-    return ret, scale
+    scale_inv = 1.0 / scale
+    ret = (scale * inp.float()).to(fp8_type)
+    return ret, scale_inv
 
 
-def cast_from_fp8(inp: torch.Tensor, scale: torch.Tensor, ret_type: torch.dtype) -> torch.Tensor:
+def cast_from_fp8(inp: torch.Tensor, scale_inv: torch.Tensor, ret_type: torch.dtype) -> torch.Tensor:
     r"""
 
     Args:
@@ -47,12 +49,13 @@ def cast_from_fp8(inp: torch.Tensor, scale: torch.Tensor, ret_type: torch.dtype)
         torch.Tensor
     """
     if inp.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]:
-        return inp
+        raise TypeError("Only float8_e4m3fn and float8_e5m2 are allowed.")
+
     if inp.dim() == 2:
-        ret = scale[:, None] * inp.to(ret_type)
+        ret = scale_inv[:, None] * inp.float()
     else:
-        ret = scale * inp.to(ret_type)
-    return ret
+        ret = scale_inv * inp.float()
+    return ret.to(ret_type)
 
 
 def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3") -> None:
@@ -69,7 +72,6 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3") -> None:
     """
 
     world_size = dist.get_world_size()
-    rank = dist.get_rank()
     input_type = tensor.dtype
     input_shape = tensor.shape
     input_device = tensor.device

From e88190184aea4e67fe5432d0501314287fedec62 Mon Sep 17 00:00:00 2001
From: BurkeHulk <hangxu0304@gmail.com>
Date: Fri, 12 Jul 2024 15:25:25 +0800
Subject: [PATCH 005/167] support fp8 communication in pipeline parallelism

---
 .../booster/plugin/hybrid_parallel_plugin.py  |  3 +
 .../pipeline/schedule/interleaved_pp.py       | 29 ++++++++
 colossalai/pipeline/schedule/one_f_one_b.py   | 26 +++++++
 colossalai/quantization/fp8.py                | 69 ++++++++++++++++++-
 4 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index a3d6f1e74771..b818209a6668 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -992,6 +992,7 @@ def __init__(
         make_vocab_size_divisible_by: int = 64,
         dp_outside: bool = True,
         overlap_p2p: bool = True,
+        fp8_communication: bool = False,
     ) -> None:
         super().__init__()
         assert (
@@ -1082,6 +1083,7 @@ def __init__(
                     microbatch_size=microbatch_size,
                     enable_metadata_cache=enable_metadata_cache,
                     overlap_p2p=overlap_p2p,
+                    fp8_communication=fp8_communication,
                 )
             elif pp_style == "1f1b":
                 self.schedule = OneForwardOneBackwardSchedule(
@@ -1089,6 +1091,7 @@ def __init__(
                     num_microbatches=num_microbatches,
                     microbatch_size=microbatch_size,
                     enable_metadata_cache=enable_metadata_cache,
+                    fp8_communication=fp8_communication,
                 )
             else:
                 raise NotImplementedError()
diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index a21b45c44a2c..86ce536d0a3c 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -12,6 +12,7 @@
 from colossalai.pipeline.p2p import PipelineP2PCommunication, create_send_metadata
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.utils import get_current_device
+from colossalai.quantization.fp8 import cast_to_fp8_pipeline, cast_from_fp8_pipeline
 
 from ._utils import detach, get_batch_size, get_micro_batch, merge_batch, model_forward, retain_grad, to_device
 from .base import PipelineSchedule
@@ -32,6 +33,7 @@ def __init__(
         microbatch_size: Optional[int] = None,
         enable_metadata_cache: bool = True,
         overlap_p2p: bool = True,
+        fp8_communication: bool = False,
     ) -> None:
         super().__init__(stage_manager)
         assert (
@@ -56,6 +58,7 @@ def __init__(
         self.tensor_metadata_recv = None
         self.grad_metadata_recv = None
 
+        self.fp8_communication = fp8_communication
     def load_batch(self, data_iter: Iterable, device: Optional[torch.device] = None) -> None:
         """Load a batch from data iterator.
 
@@ -191,8 +194,12 @@ def send_forward(self, model_chunk_id: int, output_tensor: Any, next_rank: int =
         """
         with self.stage_manager.switch_model_chunk_id(model_chunk_id):
             if not self.stage_manager.is_last_stage():
+                if self.fp8_communication:
+                    cast_to_fp8_pipeline(output_tensor)
                 send_handles = self.comm.send_forward(output_tensor, next_rank, send_metadata=self.send_tensor_metadata)
                 self.send_tensor_metadata = not self.enable_metadata_cache
+                if self.fp8_communication:
+                    cast_from_fp8_pipeline(output_tensor)
                 return send_handles
         return []
 
@@ -210,10 +217,14 @@ def send_backward(self, model_chunk_id: int, input_tensor_grad: Any, prev_rank:
         """
         with self.stage_manager.switch_model_chunk_id(model_chunk_id):
             if not self.stage_manager.is_first_stage():
+                if self.fp8_communication:
+                    cast_to_fp8_pipeline(input_tensor_grad)
                 send_handles = self.comm.send_backward(
                     input_tensor_grad, prev_rank, send_metadata=self.send_grad_metadata
                 )
                 self.send_grad_metadata = not self.enable_metadata_cache
+                if self.fp8_communication:
+                    cast_from_fp8_pipeline(input_tensor_grad)
                 return send_handles
         return []
 
@@ -224,6 +235,8 @@ def send_forward_recv_forward(
             is_send = not self.stage_manager.is_last_stage()
         with self.stage_manager.switch_model_chunk_id(model_chunk_id_recv):
             is_recv = not self.stage_manager.is_first_stage()
+        if self.fp8_communication:
+            cast_to_fp8_pipeline(output_tensor)
         input_tensor, wait_handles = self.comm.send_forward_recv_forward(
             output_tensor,
             is_send,
@@ -237,6 +250,8 @@ def send_forward_recv_forward(
         if is_recv and self.enable_metadata_cache and self.tensor_metadata_recv is None:
             self.tensor_metadata_recv = create_send_metadata(input_tensor)
 
+        if self.fp8_communication:
+            cast_from_fp8_pipeline(output_tensor)
         return input_tensor, wait_handles
 
     def send_backward_recv_backward(
@@ -246,6 +261,8 @@ def send_backward_recv_backward(
             is_send = not self.stage_manager.is_first_stage()
         with self.stage_manager.switch_model_chunk_id(model_chunk_id_recv):
             is_recv = not self.stage_manager.is_last_stage()
+        if self.fp8_communication:
+            cast_to_fp8_pipeline(input_tensor_grad)
         output_tensor_grad, wait_handles = self.comm.send_backward_recv_backward(
             input_tensor_grad,
             is_send,
@@ -258,6 +275,8 @@ def send_backward_recv_backward(
         self.send_grad_metadata = not self.enable_metadata_cache and is_send
         if is_recv and self.enable_metadata_cache and self.grad_metadata_recv is None:
             self.grad_metadata_recv = create_send_metadata(output_tensor_grad)
+        if self.fp8_communication:
+            cast_from_fp8_pipeline(input_tensor_grad)
         return output_tensor_grad, wait_handles
 
     def forward_step(
@@ -379,6 +398,8 @@ def run_forward_only(
 
             # Wait until current input is received
             _wait_p2p(fwd_wait_handles)
+            if self.fp8_communication and input_obj is not None:
+                cast_from_fp8_pipeline(input_obj)
             output_obj = self.forward_step(model_chunk, model_chunk_id, input_obj, criterion, accum_loss, outputs)
 
             if not last_batch:
@@ -441,6 +462,8 @@ def run_forward_backward(
 
             # Wait for input
             _wait_p2p(fwd_wait_handles)
+            if self.fp8_communication and input_obj is not None:
+                cast_from_fp8_pipeline(input_obj)
             output_obj = self.forward_step(model_chunk, model_chunk_id, input_obj, criterion, accum_loss, outputs)
             input_objs[model_chunk_id].append(input_obj)
             output_objs[model_chunk_id].append(output_obj)
@@ -467,6 +490,8 @@ def run_forward_backward(
 
             # Wait for input.
             _wait_p2p(fwd_wait_handles)
+            if self.fp8_communication and input_obj is not None:
+                cast_from_fp8_pipeline(input_obj)
             output_obj = self.forward_step(model_chunk, model_chunk_id, input_obj, criterion, accum_loss, outputs)
             # Add input_obj and output_obj to end of list.
             input_objs[model_chunk_id].append(input_obj)
@@ -511,6 +536,8 @@ def send_backward_recv_backward():
             input_obj, fwd_wait_handles = send_forward_recv_forward()
             # Wait for upstream grad
             _wait_p2p(bwd_wait_handles)
+            if self.fp8_communication and output_obj_grad is not None:
+                cast_from_fp8_pipeline(output_obj_grad)
             input_obj_grad = self.backward_step(optimizer, _input_obj, _output_obj, output_obj_grad)
             # NOTE: It's documented by NCCL that running two concurrent communicators (batch_isend_irecv)
             # risks deadlock (https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2134/user-guide/docs/usage/communicators.html)
@@ -532,6 +559,8 @@ def send_backward_recv_backward():
 
             # Wait for upstream grad
             _wait_p2p(bwd_wait_handles)
+            if self.fp8_communication and output_obj_grad is not None:
+                cast_from_fp8_pipeline(output_obj_grad)
             # backward local grads
             input_obj_grad = self.backward_step(optimizer, _input_obj, _output_obj, output_obj_grad)
             if not last_batch:
diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index 7f0d0e3493f7..90ebb0534497 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -11,6 +11,7 @@
 from colossalai.pipeline.p2p import PipelineP2PCommunication, create_send_metadata
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.utils import get_current_device
+from colossalai.quantization.fp8 import cast_to_fp8_pipeline, cast_from_fp8_pipeline
 
 from ._utils import (
     detach,
@@ -32,6 +33,7 @@ def __init__(
         num_microbatches: Optional[int] = None,
         microbatch_size: Optional[int] = None,
         enable_metadata_cache: bool = True,
+        fp8_communication: bool = False,
     ) -> None:
         """1F1B pipeline schedule.
 
@@ -61,6 +63,8 @@ def __init__(
         self.tensor_metadata_recv = None
         self.grad_metadata_recv = None
 
+        self.fp8_communication = fp8_communication
+
     def load_batch(self, data_iter: Iterable, device: Optional[torch.device] = None) -> None:
         """Load a batch from data iterator.
 
@@ -129,6 +133,8 @@ def recv_forward(self, prev_rank: int = None) -> Any:
             if self.enable_metadata_cache and self.tensor_metadata_recv is None:
                 self.tensor_metadata_recv = create_send_metadata(input_tensor)
 
+            if self.fp8_communication:
+                cast_from_fp8_pipeline(input_tensor)
             return input_tensor
 
     def recv_backward(self, next_rank: int = None) -> Any:
@@ -143,6 +149,8 @@ def recv_backward(self, next_rank: int = None) -> Any:
         """
         if not self.stage_manager.is_last_stage():
             output_tensor_grad, _ = self.comm.recv_backward(next_rank, metadata_recv=self.grad_metadata_recv)
+            if self.fp8_communication:
+                cast_from_fp8_pipeline(output_tensor_grad)
             if self.enable_metadata_cache and self.grad_metadata_recv is None:
                 self.grad_metadata_recv = create_send_metadata(output_tensor_grad)
 
@@ -157,9 +165,13 @@ def send_forward(self, output_tensor: Any, next_rank: int = None) -> None:
             next_rank (int, optional): The rank of the recipient of the tensor.
         """
         if not self.stage_manager.is_last_stage():
+            if self.fp8_communication:
+                cast_to_fp8_pipeline(output_tensor)
             self.comm.send_forward(output_tensor, next_rank, send_metadata=self.send_tensor_metadata)
             self.send_tensor_metadata = not self.enable_metadata_cache
 
+            if self.fp8_communication:
+                cast_from_fp8_pipeline(output_tensor, del_metadata=False)
     def send_backward(self, input_tensor_grad: Any, prev_rank: int = None) -> None:
         """Sends the gradient tensor to the previous stage in pipeline.
            For 1F1B.
@@ -169,8 +181,12 @@ def send_backward(self, input_tensor_grad: Any, prev_rank: int = None) -> None:
             prev_rank (int, optional): The rank of the recipient of the tensor
         """
         if not self.stage_manager.is_first_stage():
+            if self.fp8_communication:
+                cast_to_fp8_pipeline(input_tensor_grad)
             self.comm.send_backward(input_tensor_grad, prev_rank, send_metadata=self.send_grad_metadata)
             self.send_grad_metadata = not self.enable_metadata_cache
+            if self.fp8_communication:
+                cast_from_fp8_pipeline(input_tensor_grad, del_metadata=False)
 
     def send_forward_recv_backward(self, output_tensor: Any, send_first: Optional[bool] = None) -> Any:
         """Sends the input tensor to the next stage and copy the gradient tensor from the next stage in pipeline.
@@ -183,6 +199,8 @@ def send_forward_recv_backward(self, output_tensor: Any, send_first: Optional[bo
         if not self.stage_manager.is_last_stage():
             if not self.send_tensor_metadata and self.grad_metadata_recv is not None:
                 send_first = None
+            if self.fp8_communication:
+                cast_to_fp8_pipeline(output_tensor)
             output_tensor_grad, _ = self.comm.send_forward_recv_backward(
                 output_tensor,
                 send_metadata=self.send_tensor_metadata,
@@ -192,6 +210,9 @@ def send_forward_recv_backward(self, output_tensor: Any, send_first: Optional[bo
             self.send_tensor_metadata = not self.enable_metadata_cache
             if self.enable_metadata_cache and self.grad_metadata_recv is None:
                 self.grad_metadata_recv = create_send_metadata(output_tensor_grad)
+            if self.fp8_communication:
+                cast_from_fp8_pipeline(output_tensor, del_metadata=False)
+                cast_from_fp8_pipeline(output_tensor_grad)
 
             return output_tensor_grad
 
@@ -206,6 +227,8 @@ def send_backward_recv_forward(self, input_tensor_grad: Any, send_first: Optiona
         if not self.stage_manager.is_first_stage():
             if not self.send_grad_metadata and self.tensor_metadata_recv is not None:
                 send_first = None  # must not fallback
+            if self.fp8_communication:
+                cast_to_fp8_pipeline(input_tensor_grad)
             input_tensor, _ = self.comm.send_backward_recv_forward(
                 input_tensor_grad,
                 send_metadata=self.send_grad_metadata,
@@ -215,6 +238,9 @@ def send_backward_recv_forward(self, input_tensor_grad: Any, send_first: Optiona
             self.send_grad_metadata = not self.enable_metadata_cache
             if self.enable_metadata_cache and self.tensor_metadata_recv is None:
                 self.tensor_metadata_recv = create_send_metadata(input_tensor)
+            if self.fp8_communication:
+                cast_from_fp8_pipeline(input_tensor)
+                cast_from_fp8_pipeline(input_tensor_grad, del_metadata=False)
 
             return input_tensor
 
diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 051ecb45a8fc..c02223331163 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -104,4 +104,71 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3") -> None:
     for i in range(world_size):
         tensor_list[i] = tensor_list[i].view(fp8_type).to(input_type) * scale_list[i]
     tensor_out = torch.cat(tensor_list, dim=0)
-    tensor.data = tensor_out.view(input_shape).to(input_type)
\ No newline at end of file
+    tensor.data = tensor_out.view(input_shape).to(input_type)
+
+
+
+def cast_to_fp8_pipeline(inp: Any) -> None:
+    """
+    Cast the hidden_states tensor of inp object to fp8 format before p2p communication in pipeline.
+    The activations tensor is indexed by 'hidden_states' in the inp dict.
+    After FP8 casting, the resulting tensor is saved as float16 or bfloat16 format but the size becomes halved.
+    Metadata such as fp8_scale is saved into inp dict for communication.
+    """
+    if inp is None:
+        return
+    # In pipeline parallelism, when inp is torch.Tensor, it only contains one element, thus can be omitted.
+    if type(inp) == torch.Tensor:
+        return
+
+    assert 'hidden_states' in inp, 'required by pipeline parallelism.'
+    inp_tensor = inp["hidden_states"]
+
+    min_val, max_val = inp_tensor.aminmax()
+    amax = torch.maximum(min_val.abs(), max_val.abs())
+
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    if amax > finfo.max:
+        fp8_type = torch.float8_e5m2
+        fp8_view_type = torch.float16
+    else:
+        fp8_type = torch.float8_e4m3fn
+        fp8_view_type = torch.bfloat16
+
+    finfo = torch.finfo(fp8_type)
+    scale = torch.tensor(1.0).to(inp_tensor.device) if amax == 0.0 else finfo.max / amax.float()
+    q_tensor = (inp_tensor.data.float() * scale)
+    # Todo: Currently we use fp8_view_type <float16, bfloat16> to indicate which fp8 format is used. This is a temporary workaround due to 'Only support tensor for fast send'.
+    #  inp_tensor needs to be a float datatype to avoid error during gradient placement.
+    inp_tensor.data = q_tensor.to(fp8_type).view(fp8_view_type)
+
+    inp["fp8_scale"] = scale.float().reciprocal()
+
+
+
+def cast_from_fp8_pipeline(inp: Any, del_metadata=True) -> None:
+    """
+    Cast the FP8 encoded hidden_states tensor back to original dtype after p2p communication in pipeline.
+    del_metadata = False is useful when this function is called before p2p communication.
+    """
+    if inp is None:
+        return
+    if type(inp) == torch.Tensor:
+        return
+
+    assert 'hidden_states' in inp, 'required by pipeline parallelism.'
+    inp_tensor = inp["hidden_states"]
+    scale = inp["fp8_scale"]
+
+    fp8_view_type = inp_tensor.dtype
+    if fp8_view_type == torch.float16:
+        fp8_type = torch.float8_e5m2
+    elif fp8_view_type == torch.bfloat16:
+        fp8_type = torch.float8_e4m3fn
+    else:
+        raise TypeError("Only float16, bfloat16 are implemented.")
+
+    inp_tensor.data = inp_tensor.data.view(fp8_type).to(torch.float16) * scale
+
+    if del_metadata:
+        del inp["fp8_scale"]
\ No newline at end of file

From 66018749f3fd79ff92e36b6fa39262f4c6355872 Mon Sep 17 00:00:00 2001
From: BurkeHulk <hangxu0304@gmail.com>
Date: Fri, 12 Jul 2024 15:26:17 +0800
Subject: [PATCH 006/167] add fp8_communication flag in the script

---
 examples/language/bert/finetune.py                  | 2 ++
 examples/language/gpt/hybridparallelism/finetune.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/examples/language/bert/finetune.py b/examples/language/bert/finetune.py
index 7e8c07fdce47..8a59ab6838a6 100644
--- a/examples/language/bert/finetune.py
+++ b/examples/language/bert/finetune.py
@@ -190,6 +190,7 @@ def main():
     )
     parser.add_argument("--target_f1", type=float, default=None, help="target f1 score. Raise exception if not reached")
     parser.add_argument("--use_lazy_init", type=bool, default=False, help="for initiating lazy init context")
+    parser.add_argument("--use_fp8_comm", type=bool, default=False, help="for using fp8 during communication")
     args = parser.parse_args()
 
     if args.model_type == "bert":
@@ -232,6 +233,7 @@ def main():
             zero_stage=1,
             precision="fp16",
             initial_scale=1,
+            fp8_communication=args.use_fp8_comm,
         )
 
     booster = Booster(plugin=plugin, **booster_kwargs)
diff --git a/examples/language/gpt/hybridparallelism/finetune.py b/examples/language/gpt/hybridparallelism/finetune.py
index 777d16cb9ea0..9b3a101609dc 100644
--- a/examples/language/gpt/hybridparallelism/finetune.py
+++ b/examples/language/gpt/hybridparallelism/finetune.py
@@ -187,6 +187,7 @@ def main():
     )
     parser.add_argument("--target_f1", type=float, default=None, help="target f1 score. Raise exception if not reached")
     parser.add_argument("--use_lazy_init", type=bool, default=False, help="for initiating lazy init context")
+    parser.add_argument("--use_fp8_comm", type=bool, default=False, help="for using fp8 during communication")
     args = parser.parse_args()
 
     if args.model_type == "gpt2":
@@ -225,6 +226,7 @@ def main():
             zero_stage=1,
             precision="fp16",
             initial_scale=1,
+            fp8_communication=args.use_fp8_comm,
         )
 
     booster = Booster(plugin=plugin, **booster_kwargs)

From 51f916b11d87ecdfa3763da7a6b396a030b32b13 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 12 Jul 2024 07:33:44 +0000
Subject: [PATCH 007/167] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/pipeline/schedule/interleaved_pp.py |  3 ++-
 colossalai/pipeline/schedule/one_f_one_b.py    |  3 ++-
 colossalai/quantization/fp8.py                 | 12 +++++-------
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index 86ce536d0a3c..a7571c73139b 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -11,8 +11,8 @@
 from colossalai.interface import OptimizerWrapper
 from colossalai.pipeline.p2p import PipelineP2PCommunication, create_send_metadata
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.quantization.fp8 import cast_from_fp8_pipeline, cast_to_fp8_pipeline
 from colossalai.utils import get_current_device
-from colossalai.quantization.fp8 import cast_to_fp8_pipeline, cast_from_fp8_pipeline
 
 from ._utils import detach, get_batch_size, get_micro_batch, merge_batch, model_forward, retain_grad, to_device
 from .base import PipelineSchedule
@@ -59,6 +59,7 @@ def __init__(
         self.grad_metadata_recv = None
 
         self.fp8_communication = fp8_communication
+
     def load_batch(self, data_iter: Iterable, device: Optional[torch.device] = None) -> None:
         """Load a batch from data iterator.
 
diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index 90ebb0534497..3269d67ba7d4 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -10,8 +10,8 @@
 from colossalai.interface import ModelWrapper, OptimizerWrapper
 from colossalai.pipeline.p2p import PipelineP2PCommunication, create_send_metadata
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.quantization.fp8 import cast_from_fp8_pipeline, cast_to_fp8_pipeline
 from colossalai.utils import get_current_device
-from colossalai.quantization.fp8 import cast_to_fp8_pipeline, cast_from_fp8_pipeline
 
 from ._utils import (
     detach,
@@ -172,6 +172,7 @@ def send_forward(self, output_tensor: Any, next_rank: int = None) -> None:
 
             if self.fp8_communication:
                 cast_from_fp8_pipeline(output_tensor, del_metadata=False)
+
     def send_backward(self, input_tensor_grad: Any, prev_rank: int = None) -> None:
         """Sends the gradient tensor to the previous stage in pipeline.
            For 1F1B.
diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index c02223331163..e514f435eaed 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -1,4 +1,4 @@
-from typing import Any, Callable, List, Optional, Tuple, Union
+from typing import Any
 
 import torch
 import torch.distributed as dist
@@ -107,7 +107,6 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3") -> None:
     tensor.data = tensor_out.view(input_shape).to(input_type)
 
 
-
 def cast_to_fp8_pipeline(inp: Any) -> None:
     """
     Cast the hidden_states tensor of inp object to fp8 format before p2p communication in pipeline.
@@ -121,7 +120,7 @@ def cast_to_fp8_pipeline(inp: Any) -> None:
     if type(inp) == torch.Tensor:
         return
 
-    assert 'hidden_states' in inp, 'required by pipeline parallelism.'
+    assert "hidden_states" in inp, "required by pipeline parallelism."
     inp_tensor = inp["hidden_states"]
 
     min_val, max_val = inp_tensor.aminmax()
@@ -137,7 +136,7 @@ def cast_to_fp8_pipeline(inp: Any) -> None:
 
     finfo = torch.finfo(fp8_type)
     scale = torch.tensor(1.0).to(inp_tensor.device) if amax == 0.0 else finfo.max / amax.float()
-    q_tensor = (inp_tensor.data.float() * scale)
+    q_tensor = inp_tensor.data.float() * scale
     # Todo: Currently we use fp8_view_type <float16, bfloat16> to indicate which fp8 format is used. This is a temporary workaround due to 'Only support tensor for fast send'.
     #  inp_tensor needs to be a float datatype to avoid error during gradient placement.
     inp_tensor.data = q_tensor.to(fp8_type).view(fp8_view_type)
@@ -145,7 +144,6 @@ def cast_to_fp8_pipeline(inp: Any) -> None:
     inp["fp8_scale"] = scale.float().reciprocal()
 
 
-
 def cast_from_fp8_pipeline(inp: Any, del_metadata=True) -> None:
     """
     Cast the FP8 encoded hidden_states tensor back to original dtype after p2p communication in pipeline.
@@ -156,7 +154,7 @@ def cast_from_fp8_pipeline(inp: Any, del_metadata=True) -> None:
     if type(inp) == torch.Tensor:
         return
 
-    assert 'hidden_states' in inp, 'required by pipeline parallelism.'
+    assert "hidden_states" in inp, "required by pipeline parallelism."
     inp_tensor = inp["hidden_states"]
     scale = inp["fp8_scale"]
 
@@ -171,4 +169,4 @@ def cast_from_fp8_pipeline(inp: Any, del_metadata=True) -> None:
     inp_tensor.data = inp_tensor.data.view(fp8_type).to(torch.float16) * scale
 
     if del_metadata:
-        del inp["fp8_scale"]
\ No newline at end of file
+        del inp["fp8_scale"]

From 457a0de79fd2d3602eba0ac78e606acb6401fc60 Mon Sep 17 00:00:00 2001
From: GuangyaoZhang <xjtu521@qq.com>
Date: Mon, 8 Jul 2024 07:04:48 +0000
Subject: [PATCH 008/167] shardformer fp8

---
 .../booster/plugin/hybrid_parallel_plugin.py  |   4 +-
 colossalai/params.py                          |   1 +
 colossalai/quantization/fp8.py                |  42 ++-
 colossalai/shardformer/layer/_operation.py    | 290 +++++++++++++-----
 colossalai/shardformer/layer/linear.py        |  18 +-
 .../shardformer/layer/qkv_fused_linear.py     |  43 ++-
 colossalai/shardformer/modeling/gpt2.py       |   2 +
 colossalai/shardformer/modeling/llama.py      |  26 +-
 colossalai/shardformer/policies/gpt2.py       |  10 +-
 colossalai/shardformer/policies/llama.py      |  14 +-
 colossalai/shardformer/shard/shard_config.py  |   2 +
 examples/language/bert/finetune.py            |   5 +-
 examples/language/bert/test_ci.sh             |   2 +-
 .../gpt/hybridparallelism/finetune.py         |  11 +-
 .../test_model/test_shard_gpt2.py             |  78 +++--
 .../test_model/test_shard_llama.py            | 174 ++++++-----
 16 files changed, 504 insertions(+), 218 deletions(-)
 create mode 100644 colossalai/params.py

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index b818209a6668..c210ca91e68a 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -945,7 +945,8 @@ class HybridParallelPlugin(PipelinePluginBase):
         gradient_checkpoint_config (GradientCheckpointConfig, optional): Configuration for gradient checkpointing. Defaults to None.
         enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
         make_vocab_size_divisible_by (int, optional): it's used when padding the vocabulary size, to make it choose an faster kenel. Default to 64.
-        overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism
+        overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism.
+        fp8_communication (bool, optional): Whether to enable fp8 communication in model parallelism
     """
 
     def __init__(
@@ -1119,6 +1120,7 @@ def __init__(
             parallel_output=parallel_output,
             make_vocab_size_divisible_by=make_vocab_size_divisible_by,
             gradient_checkpoint_config=gradient_checkpoint_config,
+            fp8_communication=fp8_communication,
         )
         self.amp_config = dict(
             initial_scale=initial_scale,
diff --git a/colossalai/params.py b/colossalai/params.py
new file mode 100644
index 000000000000..4058ce430834
--- /dev/null
+++ b/colossalai/params.py
@@ -0,0 +1 @@
+to_cast = []
diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index e514f435eaed..66bcd0295fb6 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -12,7 +12,6 @@ def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3") -> (torch.Tensor, torch.Te
         scale: scaling factor for fp8 casting. If it is None, then it is computed automatically. Per-channel scaling
         is applied if input tensor is 2 dimension, otherwise, per-tensor scaling is applied.
         fp8_format: e4m3 or e5m2
-
     Returns:
         Tuples: A tuple (fp8_tensor, scale)
     """
@@ -39,12 +38,10 @@ def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3") -> (torch.Tensor, torch.Te
 
 def cast_from_fp8(inp: torch.Tensor, scale_inv: torch.Tensor, ret_type: torch.dtype) -> torch.Tensor:
     r"""
-
     Args:
         inp: should be a fp8 torch tensor in one of the types: [torch.float8_e4m3fn, torch.float8_e5m2].
         scale: scaling factor returned by cast_to_fp8 function.
         ret_type: the datatype of the returned tensor.
-
     Returns:
         torch.Tensor
     """
@@ -62,11 +59,9 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3") -> None:
     r"""
     This is an in-place operation for compressed all_reduce using fp8.
     It works like dist.all_reduce but during communication the data is cast to fp8 format.
-
     Args:
         tensor: torch.Tensor in fp32, fp16, bf16 datatype.
         fp8_format: e4m3 or e5m2
-
     Returns:
         None
     """
@@ -170,3 +165,40 @@ def cast_from_fp8_pipeline(inp: Any, del_metadata=True) -> None:
 
     if del_metadata:
         del inp["fp8_scale"]
+
+
+def reduce_scatter_fp8(output: torch.Tensor, input_list, group, fp8_format="e4m3") -> None:
+    r"""
+    This is an in-place operation for compressed all_reduce using fp8.
+    It works like dist.all_reduce but during communication the data is cast to fp8 format.
+
+    Args:
+        tensor: torch.Tensor in fp32, fp16, bf16 datatype.
+        fp8_format: e4m3 or e5m2
+
+    Returns:
+        None
+    """
+
+    input_type = output.dtype
+
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+    scale_list = []
+    cast_input_list = []
+    output_chunks = []
+    output_scale_list = []
+    for input in input_list:
+        ret, scale = cast_to_fp8(input, fp8_format=fp8_format)
+        scale_list.append(scale)
+        ret = ret.view(torch.uint8)
+        cast_input_list.append(ret)
+        output_chunks.append(torch.empty_like(ret))
+        output_scale_list.append(torch.empty_like(scale))
+    dist.all_to_all(output_chunks, cast_input_list, group=group)
+    dist.all_to_all(output_scale_list, scale_list, group=group)
+
+    summed_out = torch.zeros_like(output_chunks[0]).to(input_type)
+    for scale, out in zip(output_scale_list, output_chunks):
+        out = out.view(fp8_type)
+        summed_out += cast_from_fp8(out, scale, input_type)
+    output.data = summed_out
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index 82d37bb4cf94..12ed1d409310 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -14,6 +14,8 @@
 except ImportError:
     _grad_accum_fusion_available = False
 
+from colossalai.quantization.fp8 import all_reduce_fp8, cast_from_fp8, cast_to_fp8, reduce_scatter_fp8
+
 
 class FusedLayerNormAffineFunction1D(torch.autograd.Function):
     r"""Layernorm
@@ -59,11 +61,12 @@ class MatmulWithAsyncCommunication(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce):
+    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False):
         ctx.save_for_backward(input_, weight, bias)
         ctx.use_bias = bias is not None
         ctx.process_group = process_group
         ctx.async_grad_allreduce = async_grad_allreduce
+        ctx.fp8_communication = fp8_communication
 
         output = torch.matmul(input_, weight)
 
@@ -76,6 +79,7 @@ def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce):
     def backward(ctx, grad_output):
         input, weight, bias = ctx.saved_tensors
         use_bias = ctx.use_bias
+        fp8_communication = ctx.fp8_communication
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias.
         weight = weight.view(weight.shape)
@@ -90,7 +94,9 @@ def backward(ctx, grad_output):
             grad_output = grad_output.view(-1, grad_output.shape[-1])
             total_input = total_input.view(-1, total_input.shape[-1])
 
-        if ctx.async_grad_allreduce:
+        if fp8_communication and ctx.async_grad_allreduce:
+            _reduce(grad_input, group=ctx.process_group, fp8_communication=fp8_communication)
+        elif ctx.async_grad_allreduce:
             # Asynchronous all-reduce
             handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
             # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
@@ -99,10 +105,10 @@ def backward(ctx, grad_output):
         grad_weight = total_input.t().matmul(grad_output)
         grad_bias = grad_output.sum(dim=0) if use_bias else None
 
-        if ctx.async_grad_allreduce:
+        if ctx.async_grad_allreduce and not fp8_communication:
             handle.wait()
 
-        return grad_input, grad_weight, grad_bias, None, None, None
+        return grad_input, grad_weight, grad_bias, None, None, None, None
 
 
 class LinearWithAsyncCommunication(torch.autograd.Function):
@@ -111,11 +117,12 @@ class LinearWithAsyncCommunication(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce):
+    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False):
         ctx.save_for_backward(input_, weight, bias)
         ctx.use_bias = bias is not None
         ctx.process_group = process_group
         ctx.async_grad_allreduce = async_grad_allreduce
+        ctx.fp8_communication = fp8_communication
         if bias is not None:
             output = F.linear(input_, weight, bias)
         else:
@@ -127,6 +134,7 @@ def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce):
     def backward(ctx, grad_output):
         input, weight, bias = ctx.saved_tensors
         use_bias = ctx.use_bias
+        fp8_communication = ctx.fp8_communication
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to bias.
         if use_bias:
@@ -142,7 +150,10 @@ def backward(ctx, grad_output):
 
         if ctx.async_grad_allreduce:
             # Asynchronous all-reduce
-            handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
+            if fp8_communication:
+                all_reduce_fp8(grad_input, group=ctx.process_group)
+            else:
+                handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
             # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
             # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py
 
@@ -161,10 +172,10 @@ def backward(ctx, grad_output):
 
         grad_bias = grad_output.sum(dim=0) if use_bias else None
 
-        if ctx.async_grad_allreduce:
+        if ctx.async_grad_allreduce and not fp8_communication:
             handle.wait()
 
-        return grad_input, grad_weight, grad_bias, None, None, None
+        return grad_input, grad_weight, grad_bias, None, None, None, None
 
 
 def _ring_as_gather(func, input_to_gather=None, input_local=None, process_group=None, gather_dim=1, keep_item=False):
@@ -232,17 +243,18 @@ class _GatherForwardReduceScatterBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group, dim):
+    def forward(ctx, input_, process_group, dim, fp8_communication=False):
         ctx.process_group = process_group
         ctx.dim = dim
+        ctx.fp8_communication = fp8_communication
 
-        return _gather(input_, dim, process_group)
+        return _gather(input_, dim, process_group, fp8_communication)
 
     @staticmethod
     def backward(ctx, grad_output):
         dim = ctx.dim
         process_group = ctx.process_group
-
+        fp8_communication = ctx.fp8_communication
         # do reduce-scatter
         new_shape = list(grad_output.shape)
         assert (
@@ -253,9 +265,13 @@ def backward(ctx, grad_output):
             item.contiguous() for item in torch.chunk(grad_output, dist.get_world_size(process_group), dim=dim)
         ]
         output = torch.empty(new_shape, dtype=grad_output.dtype, device=grad_output.device)
-        dist.reduce_scatter(output, grad_list, group=process_group)
 
-        return output, None, None
+        if fp8_communication:
+            reduce_scatter_fp8(output, grad_list, group=process_group)
+        else:
+            dist.reduce_scatter(output, grad_list, group=process_group)
+
+        return output, None, None, None
 
 
 class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function):
@@ -546,9 +562,10 @@ class _ReduceScatterForwardGatherBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group, dim):
+    def forward(ctx, input_, process_group, dim, fp8_communication=False):
         ctx.dim = dim
         ctx.process_group = process_group
+        ctx.fp8_communication = fp8_communication
 
         # do reduce-scatter
         new_shape = list(input_.shape)
@@ -558,7 +575,11 @@ def forward(ctx, input_, process_group, dim):
         new_shape[dim] = new_shape[dim] // dist.get_world_size(process_group)
         input_list = [item.contiguous() for item in torch.chunk(input_, dist.get_world_size(process_group), dim=dim)]
         output = torch.empty(new_shape, dtype=input_.dtype, device=input_.device)
-        dist.reduce_scatter(output, input_list, group=process_group)
+        if fp8_communication:
+            # if False:
+            reduce_scatter_fp8(output, input_list, group=process_group)
+        else:
+            dist.reduce_scatter(output, input_list, group=process_group)
 
         return output
 
@@ -566,8 +587,9 @@ def forward(ctx, input_, process_group, dim):
     def backward(ctx, grad_output):
         dim = ctx.dim
         process_group = ctx.process_group
+        fp8_communication = ctx.fp8_communication
 
-        return _gather(grad_output, dim, process_group), None, None
+        return _gather(grad_output, dim, process_group, fp8_communication), None, None, None
 
 
 class _MatmulWithGatherForwardReduceScatterBackward(torch.autograd.Function):
@@ -582,13 +604,16 @@ class _MatmulWithGatherForwardReduceScatterBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring):
+    def forward(
+        ctx, input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring, fp8_communication
+    ):
         ctx.save_for_backward(input_, weight, bias)
         ctx.use_bias = bias is not None
         ctx.process_group = process_group
         ctx.async_grad_reduce_scatter = async_grad_reduce_scatter
         ctx.dim = dim
         ctx.overlap = overlap
+        ctx.fp8_communication = fp8_communication
 
         if ring is True:
             input_to_gather = {}
@@ -605,7 +630,7 @@ def forward(ctx, input_, weight, bias, process_group, async_grad_reduce_scatter,
             )
 
         else:
-            input_parallel = _gather(input_, dim, process_group)
+            input_parallel = _gather(input_, dim, process_group, fp8_communication)
 
             output = torch.matmul(input_parallel, weight)
 
@@ -620,6 +645,7 @@ def backward(ctx, grad_output):
         dim = ctx.dim
         process_group = ctx.process_group
         overlap = ctx.overlap
+        fp8_communication = ctx.fp8_communication
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias. Used in FusedLayerNorm
         weight = weight.view(weight.shape)
@@ -627,7 +653,7 @@ def backward(ctx, grad_output):
             bias = bias.view(bias.shape)
 
         if not overlap:
-            input_parallel = _gather(input_, dim, process_group)
+            input_parallel = _gather(input_, dim, process_group, fp8_communication)
 
             total_input = input_parallel
             grad_input = grad_output.matmul(weight.T)
@@ -687,7 +713,7 @@ def backward(ctx, grad_output):
             # wait until reduce-scatter finished
             reducescatter_handle.wait()
 
-        return output, grad_weight, grad_bias, None, None, None, None, None
+        return output, grad_weight, grad_bias, None, None, None, None, None, None
 
 
 class _SplitForwardGatherBackward(torch.autograd.Function):
@@ -702,17 +728,20 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, dim, process_group, grad_scale=None):
+    def forward(ctx, input_, dim, process_group, grad_scale=None, fp8_communication=False):
         ctx.process_group = process_group
         ctx.dim = dim
         ctx.grad_scale = grad_scale
+        ctx.fp8_communication = fp8_communication
         return _split(input_, dim, process_group)
 
     @staticmethod
     def backward(ctx, grad_output):
         if ctx.grad_scale is not None:
             grad_output = grad_output * ctx.grad_scale
-        return _gather(grad_output, ctx.dim, ctx.process_group), None, None, None
+
+        # to_cast.append(grad_output.cpu().detach().numpy())
+        return _gather(grad_output, ctx.dim, ctx.process_group, ctx.fp8_communication, "e4m3"), None, None, None, None
 
 
 class _ReduceForward(torch.autograd.Function):
@@ -725,12 +754,12 @@ class _ReduceForward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group):
-        return _reduce(input_, process_group)
+    def forward(ctx, input_, process_group, fp8_communication=False):
+        return _reduce(input_, process_group, fp8_communication)
 
     @staticmethod
     def backward(ctx, grad_output):
-        return grad_output, None
+        return grad_output, None, None
 
 
 class _ReduceBackward(torch.autograd.Function):
@@ -743,13 +772,15 @@ class _ReduceBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group):
+    def forward(ctx, input_, process_group, fp8_communication=False):
         ctx.process_group = process_group
+        ctx.fp8_communication = fp8_communication
         return input_
 
     @staticmethod
     def backward(ctx, grad_output):
-        return _reduce(grad_output, ctx.process_group), None
+        fp8_communication = ctx.fp8_communication
+        return _reduce(grad_output, ctx.process_group, fp8_communication), None, None
 
 
 class _GatherForwardSplitBackward(torch.autograd.Function):
@@ -762,17 +793,18 @@ class _GatherForwardSplitBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, dim, process_group, grad_scale=None):
+    def forward(ctx, input_, dim, process_group, grad_scale=None, fp8_comm=False):
         ctx.process_group = process_group
         ctx.dim = dim
         ctx.grad_scale = grad_scale
-        return _gather(input_, dim, process_group)
+
+        return _gather(input_, dim, process_group, fp8_comm=fp8_comm, fp8_format="e4m3")
 
     @staticmethod
     def backward(ctx, grad_output):
         if ctx.grad_scale is not None:
             grad_output = grad_output * ctx.grad_scale
-        return _split(grad_output, ctx.dim, ctx.process_group), None, None, None
+        return _split(grad_output, ctx.dim, ctx.process_group), None, None, None, None
 
 
 class _AllToAll(torch.autograd.Function):
@@ -786,26 +818,43 @@ class _AllToAll(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group, scatter_dim, gather_dim):
+    def forward(ctx, input_, process_group, scatter_dim, gather_dim, fp8_communication):
         ctx.process_group = process_group
         ctx.scatter_dim = scatter_dim
         ctx.gather_dim = gather_dim
+        ctx.fp8_communication = fp8_communication
         world_size = dist.get_world_size(process_group)
         bsz, _, _ = input_.shape
 
         # using all_to_all_single when batch size is 1
         if bsz == 1:
-            return _all_to_all_single(input_, world_size, process_group, scatter_dim, gather_dim)
+            return _all_to_all_single(
+                input_, world_size, process_group, scatter_dim, gather_dim, fp8_comm=fp8_comm, fp8_format="e5m2"
+            )
         else:
-            return _all_to_all(input_, world_size, process_group, scatter_dim, gather_dim)
+            return _all_to_all(
+                input_, world_size, process_group, scatter_dim, gather_dim, fp8_comm=fp8_comm, fp8_format="e5m2"
+            )
 
     @staticmethod
-    def backward(ctx, *grad_output):
+    def backward(ctx, grad_output):
         process_group = ctx.process_group
         scatter_dim = ctx.gather_dim
         gather_dim = ctx.scatter_dim
-        return_grad = _AllToAll.apply(*grad_output, process_group, scatter_dim, gather_dim)
-        return (return_grad, None, None, None)
+        ctx.fp8_communication
+        world_size = dist.get_world_size(process_group)
+        bsz, _, _ = grad_output.shape
+
+        if bsz == 1:
+            return_grad = _all_to_all_single(
+                grad_output, world_size, process_group, scatter_dim, gather_dim, fp8_comm=fp8_comm, fp8_format="e5m2"
+            )
+        else:
+            return_grad = _all_to_all(
+                grad_output, world_size, process_group, scatter_dim, gather_dim, fp8_comm=fp8_comm, fp8_format="e5m2"
+            )
+
+        return (return_grad, None, None, None, None)
 
 
 class HookParameter(torch.autograd.Function):
@@ -831,12 +880,15 @@ def hook_parameter_in_backward(input, weight=None, bias=None):
     return HookParameter.apply(input, weight, bias)
 
 
-def _reduce(input_, process_group):
+def _reduce(input_, process_group, fp8_communication=False):
     # skip if only one rank involved
     if dist.get_world_size(process_group) == 1:
         return input_
     else:
-        dist.all_reduce(input_, group=process_group)
+        if fp8_communication:
+            all_reduce_fp8(input_, group=process_group)
+        else:
+            dist.all_reduce(input_, group=process_group)
         return input_
 
 
@@ -860,19 +912,78 @@ def _split(input_, dim=-1, process_group=None):
     return output
 
 
-def _gather(input_, dim=-1, process_group=None):
+from colossalai.params import to_cast
+
+
+def _gather(input_, dim=-1, process_group=None, fp8_comm=False, fp8_format="e4m3"):
     # skip if only one rank involved
     world_size = dist.get_world_size(process_group)
     if world_size == 1:
         return input_
 
     # all gather
-    input_ = input_.contiguous()
-    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
-    torch.distributed.all_gather(tensor_list, input_, group=process_group)
+    import torch.distributed as dista
+
+    from colossalai.zero.low_level._utils import has_inf_or_nan
+
+    if fp8_comm:
+        # if False:
+        if has_inf_or_nan(input_):
+            print("input has nan")
+            exit(0)
+        input_type = input_.dtype
+        to_cast.append(input_)
+        ret, scale = cast_to_fp8(input_, fp8_format="e5m2")
+        if has_inf_or_nan(ret):
+            import pdb
+
+            pdb.set_trace()
+            print("cast has nan")
+            # exit(0)
+        dista.barrier()
+        fp8_type = ret.dtype
+        input_ = ret.view(torch.uint8)
+        input_ = input_.contiguous()
+        tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
+        scale = torch.tensor(scale, dtype=torch.float32).to(input_.device)
+        # import torch.distributed as dista
+        # if dista.get_rank()==0:
+        #     import pdb
+        #     pdb.set_trace()
+        # dista.barrier()
+        scale_list = [torch.ones(1, dtype=torch.float32, device=input_.device) for _ in range(world_size)]
+
+        scale = torch.tensor(scale).to(input_.device)
+        torch.distributed.all_gather(tensor_list, input_, group=process_group)
+        torch.distributed.all_gather(scale_list, scale, group=process_group)
+
+        cast_tensor_list = []
+        for output, scale in zip(tensor_list, scale_list):
+            output = output.view(fp8_type)
+            output = cast_from_fp8(output, scale, input_type)
+            if has_inf_or_nan(output) and dista.get_rank() == 0:
+                print("casted_output has nan")
+                import pdb
+
+                pdb.set_trace()
+            dista.barrier()
+
+            cast_tensor_list.append(output)
+
+        output = torch.cat(cast_tensor_list, dim=dim).contiguous()
+
+        if has_inf_or_nan(output):
+            print("output has nan")
+            exit(0)
+            # import pdb
+            # pdb.set_trace()
+        dista.barrier()
 
-    # concat
-    output = torch.cat(tensor_list, dim=dim).contiguous()
+    else:
+        input_ = input_.contiguous()
+        tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
+        torch.distributed.all_gather(tensor_list, input_, group=process_group)
+        output = torch.cat(tensor_list, dim=dim).contiguous()
 
     return output
 
@@ -901,14 +1012,31 @@ def _reduce_scatter(input_, dim=1, process_group=None):
     return output
 
 
-def _all_to_all(input_, world_size, group, scatter_dim, gather_dim):
-    input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
-    output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
-    dist.all_to_all(output_list, input_list, group=group)
+def _all_to_all(input_, world_size, group, scatter_dim, gather_dim, fp8_comm=False, fp8_format="e5m2"):
+    if fp8_comm:
+        input_type = input_.dtype
+        ret, scale = cast_to_fp8(input_, fp8_format=fp8_format)
+        fp8_type = ret.dtype
+        input_ = ret.view(torch.uint8)
+        input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
+        output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
+        scale_list = [torch.ones(1, dtype=scale.dtype, device=input_.device) for _ in range(world_size)]
+        dist.all_to_all(output_list, input_list, group=group)
+        dist.all_gather(scale_list, scale, group=group)
+        cast_tensor_list = []
+        for output, scale in zip(output_list, scale_list):
+            output = output.view(fp8_type)
+            output = cast_from_fp8(output, scale, input_type)
+            cast_tensor_list.append(output)
+        output_list = cast_tensor_list
+    else:
+        input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
+        output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
+        dist.all_to_all(output_list, input_list, group=group)
     return torch.cat(output_list, dim=gather_dim).contiguous()
 
 
-def _all_to_all_single(input_, seq_world_size, group, scatter_dim, gather_dim):
+def _all_to_all_single(input_, seq_world_size, group, scatter_dim, gather_dim, fp8_comm=False, fp8_format="e5m2"):
     inp_shape = list(input_.shape)
     inp_shape[scatter_dim] = inp_shape[scatter_dim] // seq_world_size
     if scatter_dim < 2:
@@ -920,8 +1048,24 @@ def _all_to_all_single(input_, seq_world_size, group, scatter_dim, gather_dim):
             .contiguous()
         )
 
-    output = torch.empty_like(input_t)
-    dist.all_to_all_single(output, input_t, group=group)
+    if fp8_comm:
+        input_type = input_t.dtype
+        ret, scale = cast_to_fp8(input_t, fp8_format=fp8_format)
+        fp8_type = ret.dtype
+        input_t = ret.view(torch.uint8)
+        output = torch.empty_like(input_t)
+        scale_list = [torch.ones(1, dtype=scale.dtype, device=input_.device) for _ in range(seq_world_size)]
+        dist.all_to_all_single(output, input_t, group=group)
+        dist.all_gather(scale_list, scale, group=group)
+        cast_tensor_list = []
+        for output_part, scale in zip(output, scale_list):
+            output_part = output_part.view(fp8_type)
+            output_part = cast_from_fp8(output_part, scale, input_type)
+            cast_tensor_list.append(output_part)
+        output = torch.stack(cast_tensor_list, dim=0)
+    else:
+        output = torch.empty_like(input_t)
+        dist.all_to_all_single(output, input_t, group=group)
 
     if scatter_dim < 2:
         output = output.transpose(0, 1).contiguous()
@@ -935,12 +1079,16 @@ def _all_to_all_single(input_, seq_world_size, group, scatter_dim, gather_dim):
     ).contiguous()
 
 
-def matmul_with_async_comm(input_, weight, bias, process_group, async_grad_allreduce):
-    return MatmulWithAsyncCommunication.apply(input_, weight, bias, process_group, async_grad_allreduce)
+def matmul_with_async_comm(input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False):
+    return MatmulWithAsyncCommunication.apply(
+        input_, weight, bias, process_group, async_grad_allreduce, fp8_communication
+    )
 
 
-def linear_with_async_comm(input_, weight, bias, process_group, async_grad_allreduce):
-    return LinearWithAsyncCommunication.apply(input_, weight, bias, process_group, async_grad_allreduce)
+def linear_with_async_comm(input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False):
+    return LinearWithAsyncCommunication.apply(
+        input_, weight, bias, process_group, async_grad_allreduce, fp8_communication
+    )
 
 
 def linear_gather_forward_reducescatter_backward(
@@ -951,12 +1099,12 @@ def linear_gather_forward_reducescatter_backward(
     )
 
 
-def gather_forward_reducescatter_backward(input_, process_group, dim):
-    return _GatherForwardReduceScatterBackward.apply(input_, process_group, dim)
+def gather_forward_reducescatter_backward(input_, process_group, dim, fp8_communication=False):
+    return _GatherForwardReduceScatterBackward.apply(input_, process_group, dim, fp8_communication)
 
 
-def reducescatter_forward_gather_backward(input_, process_group, dim):
-    return _ReduceScatterForwardGatherBackward.apply(input_, process_group, dim)
+def reducescatter_forward_gather_backward(input_, process_group, dim, fp8_communication=False):
+    return _ReduceScatterForwardGatherBackward.apply(input_, process_group, dim, fp8_communication)
 
 
 def linear_reducescatter_forward_gather_backward(input_, weight, bias=None, process_group=None, dim=1, ring=False):
@@ -964,28 +1112,28 @@ def linear_reducescatter_forward_gather_backward(input_, weight, bias=None, proc
 
 
 def matmul_gather_forward_reducescatter_backward(
-    input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring=False
+    input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring=False, fp8_communication=False
 ):
     return _MatmulWithGatherForwardReduceScatterBackward.apply(
-        input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring
+        input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring, fp8_communication
     )
 
 
-def gather_forward_split_backward(input_, dim, process_group, grad_scale=None):
-    return _GatherForwardSplitBackward.apply(input_, dim, process_group, grad_scale)
+def gather_forward_split_backward(input_, dim, process_group, grad_scale=None, fp8_communication=False):
+    return _GatherForwardSplitBackward.apply(input_, dim, process_group, grad_scale, fp8_communication)
 
 
-def split_forward_gather_backward(input_, dim, process_group, grad_scale=None):
-    return _SplitForwardGatherBackward.apply(input_, dim, process_group, grad_scale)
+def split_forward_gather_backward(input_, dim, process_group, grad_scale=None, fp8_communication=False):
+    return _SplitForwardGatherBackward.apply(input_, dim, process_group, grad_scale, fp8_communication)
 
 
-def reduce_forward(input_, process_group):
-    return _ReduceForward.apply(input_, process_group)
+def reduce_forward(input_, process_group, fp8_communication=False):
+    return _ReduceForward.apply(input_, process_group, fp8_communication)
 
 
-def reduce_backward(input_, process_group):
-    return _ReduceBackward.apply(input_, process_group)
+def reduce_backward(input_, process_group, fp8_communication=False):
+    return _ReduceBackward.apply(input_, process_group, fp8_communication=fp8_communication)
 
 
-def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1):
-    return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim)
+def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1, fp8_comm=False):
+    return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim, fp8_comm)
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index 37c7542416f6..af25c398b994 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -84,6 +84,7 @@ def __init__(
         bias_: Optional[Parameter] = None,
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
+        fp8_communication: bool = False,
         **kwargs,
     ):
         super().__init__(weight=weight, bias_=bias_, **kwargs)
@@ -98,6 +99,7 @@ def __init__(
         self.skip_bias_add = skip_bias_add
         self.device = device
         self.process_group = process_group
+        self.fp8_communication = fp8_communication
 
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
@@ -201,10 +203,12 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
         bias = self.bias if not self.skip_bias_add else None
 
         if self.seq_parallel_mode is None:
-            output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, True)
+            output_parallel = linear_with_async_comm(
+                input_parallel, self.weight, bias, self.process_group, True, fp8_communication=self.fp8_communication
+            )
         elif self.seq_parallel_mode == "split_gather":
             input_parallel = gather_forward_reducescatter_backward(
-                input_parallel, self.process_group, self.seq_parallel_dim
+                input_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
             )
             output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, False)
         elif self.seq_parallel_mode == "ring":
@@ -264,6 +268,7 @@ def __init__(
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
         stream_chunk_num: int = 1,
+        fp8_communication: bool = False,
     ):
         super().__init__()
 
@@ -278,6 +283,7 @@ def __init__(
         self.seq_parallel_mode = seq_parallel_mode
         self.seq_parallel_dim = seq_parallel_dim
         self.num_partitions = dist.get_world_size(self.process_group)
+        self.fp8_communication = fp8_communication
 
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
@@ -398,7 +404,9 @@ def forward(self, input_: Tensor) -> Tensor:
             ), "Invalid shapes in Linear1D_Row forward: input={}, weight={}. Expected last dim of input {}.".format(
                 input_.shape, self.weight.shape, self.weight.shape[-1] * self.num_partitions
             )
-            input_ = split_forward_gather_backward(input_, dim=-1, process_group=self.process_group)
+            input_ = split_forward_gather_backward(
+                input_, dim=-1, process_group=self.process_group, fp8_comm=self.fp8_communication
+            )
 
         if self.stream_chunk_num > 1:
             if self.training:
@@ -418,11 +426,11 @@ def forward(self, input_: Tensor) -> Tensor:
         else:
             if self.seq_parallel_mode is None:
                 output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
-                output = reduce_forward(output_parallel, self.process_group)
+                output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
             elif self.seq_parallel_mode == "split_gather":
                 output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
                 output = reducescatter_forward_gather_backward(
-                    output_parallel, self.process_group, self.seq_parallel_dim
+                    output_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
                 )
             elif self.seq_parallel_mode == "ring":
                 output = linear_reducescatter_forward_gather_backward(
diff --git a/colossalai/shardformer/layer/qkv_fused_linear.py b/colossalai/shardformer/layer/qkv_fused_linear.py
index 0f6595a7c4d6..d8425b58db4f 100644
--- a/colossalai/shardformer/layer/qkv_fused_linear.py
+++ b/colossalai/shardformer/layer/qkv_fused_linear.py
@@ -183,6 +183,7 @@ def __init__(
         bias_: Optional[Parameter] = None,
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
+        fp8_communication: bool = False,
     ):
         super().__init__()
 
@@ -197,6 +198,7 @@ def __init__(
         self.n_fused = n_fused
         self.process_group = process_group
         self.async_communication = async_communication
+        self.fp8_communication = fp8_communication
 
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
@@ -314,14 +316,26 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
 
         if self.seq_parallel_mode is None:
             # Set up backprop all-reduce.
-            input_parallel = reduce_backward(input_, self.process_group)
+            input_parallel = reduce_backward(input_, self.process_group, fp8_communication=self.fp8_communication)
             output_parallel = matmul_with_async_comm(
-                input_parallel, self.weight, bias, self.process_group, self.async_communication
+                input_parallel,
+                self.weight,
+                bias,
+                self.process_group,
+                self.async_communication,
+                fp8_communication=self.fp8_communication,
             )
         elif self.seq_parallel_mode == "split_gather":
             input_parallel = input_
             output_parallel = matmul_gather_forward_reducescatter_backward(
-                input_parallel, self.weight, bias, self.process_group, True, 1, self.overlap
+                input_parallel,
+                self.weight,
+                bias,
+                self.process_group,
+                True,
+                1,
+                self.overlap,
+                fp8_communication=self.fp8_communication,
             )
         elif self.seq_parallel_mode == "ring":
             input_parallel = input_
@@ -331,7 +345,9 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
 
         if self.gather_output:
             # All-gather across the partitions.
-            output = gather_forward_split_backward(output_parallel, dim=-1, process_group=self.process_group)
+            output = gather_forward_split_backward(
+                output_parallel, dim=-1, process_group=self.process_group, fp8_communication=self.fp8_communication
+            )
         else:
             output = output_parallel
 
@@ -379,6 +395,7 @@ def __init__(
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
         stream_chunk_num: int = 1,
+        fp8_communication: bool = False,
     ):
         super().__init__()
 
@@ -392,6 +409,7 @@ def __init__(
         self.process_group = process_group
         self.seq_parallel_mode = seq_parallel_mode
         self.num_partitions = dist.get_world_size(self.process_group)
+        self.fp8_communication = fp8_communication
 
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
@@ -514,7 +532,9 @@ def forward(self, input_: Tensor) -> Tensor:
             ), "Invalid shapes in Linear1D_Row forward: input={}, weight={}. Expected last dim of input {}.".format(
                 input_.shape, self.weight.shape, self.weight.shape[0] * self.num_partitions
             )
-            input_ = split_forward_gather_backward(input_, dim=-1, process_group=self.process_group)
+            input_ = split_forward_gather_backward(
+                input_, dim=-1, process_group=self.process_group, fp8_communication=self.fp8_communication
+            )
 
         if self.stream_chunk_num > 1:
             if self.training:
@@ -535,13 +555,20 @@ def forward(self, input_: Tensor) -> Tensor:
         else:
             if self.seq_parallel_mode is None:
                 output_parallel = torch.matmul(input_, self.weight)
-                output = reduce_forward(output_parallel, self.process_group)
+                output = reduce_forward(output_parallel, self.process_group, self.fp8_communication)
             elif self.seq_parallel_mode == "split_gather":
                 output_parallel = torch.matmul(input_, self.weight)
-                output = reducescatter_forward_gather_backward(output_parallel, self.process_group, 1)
+                output = reducescatter_forward_gather_backward(
+                    output_parallel,
+                    self.process_group,
+                    1,
+                    self.fp8_communication,
+                )
             elif self.seq_parallel_mode == "ring":
                 output_parallel = torch.matmul(input_, self.weight)
-                output = reducescatter_forward_gather_backward(output_parallel, self.process_group, 1)
+                output = reducescatter_forward_gather_backward(
+                    output_parallel, self.process_group, 1, self.fp8_communication
+                )
 
         if not self.skip_bias_add:
             if self.bias is not None:
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index aa75bab115a7..beaa47952c9f 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -1137,6 +1137,7 @@ def forward(
             hidden_states,
             dim=1,
             process_group=shard_config.sequence_parallel_process_group,
+            fp8_communication=shard_config.fp8_communication,
         )
 
         for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
@@ -1204,6 +1205,7 @@ def custom_forward(*inputs):
             hidden_states,
             dim=1,
             process_group=shard_config.sequence_parallel_process_group,
+            fp8_communication=shard_config.fp8_communication,
         )
 
         hidden_states = self.ln_f(hidden_states)
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index bf5ce45a8342..05c9fe3bd9c7 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -460,7 +460,7 @@ def llama_for_sequence_classification_forward(
             return {"hidden_states": hidden_states}
 
 
-def get_llama_flash_attention_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
+def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None):
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -510,9 +510,9 @@ def forward(
 
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
-            query_states = all_to_all_comm(query_states, sp_group)
-            key_states = all_to_all_comm(key_states, sp_group)
-            value_states = all_to_all_comm(value_states, sp_group)
+            query_states = all_to_all_comm(query_states, sp_group, fp8_comm=shard_config.fp8_communication)
+            key_states = all_to_all_comm(key_states, sp_group, fp8_comm=shard_config.fp8_communication)
+            value_states = all_to_all_comm(value_states, sp_group, fp8_comm=shard_config.fp8_communication)
             bsz, q_len, _ = query_states.size()
 
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
@@ -592,7 +592,7 @@ def forward(
     return forward
 
 
-def get_llama_flash_attention_model_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
+def get_llama_flash_attention_model_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None):
     logger = logging.get_logger(__name__)
 
     def forward(
@@ -659,9 +659,13 @@ def forward(
             attention_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
 
         if sp_mode in ["ring", "split_gather"]:
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group)
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds, 1, sp_group, fp8_comm=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group, 1 / sp_size)
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds, 1, sp_group, 1 / sp_size, fp8_comm=shard_config.fp8_communication
+            )
         hidden_states = inputs_embeds
 
         # decoder layers
@@ -706,9 +710,13 @@ def forward(
         hidden_states = self.norm(hidden_states)
 
         if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, fp8_comm=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, grad_scale=sp_size, fp8_comm=shard_config.fp8_communication
+            )
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index cfe20000a2bf..bb6269737da7 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -110,14 +110,13 @@ def module_policy(self):
                             "n_fused": 3,
                             "seq_parallel_mode": sp_mode,
                             "overlap": overlap,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attn.c_proj",
                         target_module=col_nn.GPT2FusedLinearConv1D_Row,
-                        kwargs={
-                            "seq_parallel_mode": sp_mode,
-                        },
+                        kwargs={"seq_parallel_mode": sp_mode, "fp8_communication": self.shard_config.fp8_communication},
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.c_fc",
@@ -127,14 +126,13 @@ def module_policy(self):
                             "seq_parallel_mode": sp_mode,
                             "overlap": overlap,
                             "skip_bias_add": self.enable_bias_gelu_fused,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.c_proj",
                         target_module=col_nn.GPT2FusedLinearConv1D_Row,
-                        kwargs={
-                            "seq_parallel_mode": sp_mode,
-                        },
+                        kwargs={"seq_parallel_mode": sp_mode, "fp8_communication": self.shard_config.fp8_communication},
                     ),
                     SubModuleReplacementDescription(
                         suffix="attn.attn_dropout",
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 85ec6717d0d1..be21b33e10a6 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -134,37 +134,37 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.gate_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.up_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.down_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                 ],
             )
diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py
index b64300366fc3..7372e06c2444 100644
--- a/colossalai/shardformer/shard/shard_config.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -29,6 +29,7 @@ class ShardConfig:
         enable_sequence_overlap (bool): Whether to turn on sequence overlap, which overlap the computation and communication in sequence parallelism. It can only be used when enable_sequence_parallelism is True. Defaults to False.
         gradient_checkpoint_config (Optional[GradientCheckpointConfig]): The gradient checkpoint config. Defaults to None.
         enable_all_optimization (bool): Whether to turn on all optimization tools including 'fused normalization', 'flash attention', 'JIT fused operators', 'sequence parallelism' and 'sequence overlap'. Defaults to False.
+        fp8_communication (bool, optional): Whether to enable fp8 communication in model parallelism. Defaults to False.
     """
 
     tensor_parallel_process_group: Optional[ProcessGroup] = None
@@ -47,6 +48,7 @@ class ShardConfig:
     gradient_checkpoint_config: Optional[GradientCheckpointConfig] = None
     extra_kwargs: Dict[str, Any] = field(default_factory=dict)
     ep_group: Optional[ProcessGroup] = None
+    fp8_communication: bool = False
     # pipeline_parallel_size: int
     # data_parallel_size: int
     # tensor_parallel_mode: Literal['1d', '2d', '2.5d', '3d']
diff --git a/examples/language/bert/finetune.py b/examples/language/bert/finetune.py
index 8a59ab6838a6..d9246ff49a9f 100644
--- a/examples/language/bert/finetune.py
+++ b/examples/language/bert/finetune.py
@@ -224,7 +224,10 @@ def main():
         # modify the param accordingly for finetuning test cases
         plugin = HybridParallelPlugin(
             tp_size=1,
-            pp_size=2,
+            pp_size=1,
+            sp_size=2,
+            enable_sequence_parallelism=True,
+            sequence_parallelism_mode="all_to_all",
             num_microbatches=None,
             pp_style="interleaved",
             num_model_chunks=2,
diff --git a/examples/language/bert/test_ci.sh b/examples/language/bert/test_ci.sh
index fc4eacf6fe79..de35e0fa5c53 100755
--- a/examples/language/bert/test_ci.sh
+++ b/examples/language/bert/test_ci.sh
@@ -5,7 +5,7 @@ pip install -r requirements.txt
 
 FAIL_LIMIT=3
 
-for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero" "hybrid_parallel"; do
+for plugin in "hybrid_parallel"; do
     for i in $(seq 1 $FAIL_LIMIT); do
         torchrun --standalone --nproc_per_node 4 finetune.py --target_f1 0.86 --plugin $plugin --model_type "bert" && break
         echo "Failed $i times"
diff --git a/examples/language/gpt/hybridparallelism/finetune.py b/examples/language/gpt/hybridparallelism/finetune.py
index 9b3a101609dc..ae80ddad5884 100644
--- a/examples/language/gpt/hybridparallelism/finetune.py
+++ b/examples/language/gpt/hybridparallelism/finetune.py
@@ -218,8 +218,11 @@ def main():
     elif args.plugin == "hybrid_parallel":
         # modify the param accordingly for finetuning test cases
         plugin = HybridParallelPlugin(
-            tp_size=1,
-            pp_size=2,
+            tp_size=2,
+            pp_size=1,
+            sp_size=2,
+            sequence_parallelism_mode="split_gather",
+            enable_sequence_parallelism=True,
             num_microbatches=None,
             microbatch_size=1,
             enable_all_optimization=True,
@@ -318,3 +321,7 @@ def _criterion(outputs, inputs):
 
 if __name__ == "__main__":
     main()
+    if dist.get_rank() == 0:
+        import pdb
+
+        pdb.set_trace()
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index f9e368c0ebf3..8fa0997d540c 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -51,7 +51,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         if test_config["precision"] == "fp32":
             atol, rtol = 1e-4, 1e-3
         else:
-            atol, rtol = 5e-3, 5e-3
+            atol, rtol = 5e-2, 5e-2
         col_layer_grads = get_grad_tensors_for_check(
             gpt2,
             sharded_gpt2,
@@ -97,7 +97,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         if test_config["precision"] == "fp32":
             atol, rtol = 1e-5, 1e-3
         else:
-            atol, rtol = 5e-3, 5e-3
+            atol, rtol = 5e-2, 5e-2
 
         if org_model.__class__.__name__ == "GPT2Model":
             check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
@@ -131,17 +131,47 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @parameterize(
     "test_config",
     [
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "num_microbatches": 1,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "ring",
-            "enable_flash_attention": False,
-            "use_lazy_init": True,
-            "precision": "fp32",
-            "initial_scale": 1,
-        },
+        # {
+        #     "tp_size": 4,
+        #     "pp_size": 1,
+        #     "num_microbatches": 1,
+        #     "enable_sequence_parallelism": True,
+        #     "sequence_parallelism_mode": "ring",
+        #     "enable_flash_attention": False,
+        #     "use_lazy_init": True,
+        #     "precision": "fp32",
+        #     "initial_scale": 1,
+        # },
+        # {
+        #     "tp_size": 4,
+        #     "pp_size": 1,
+        #     "num_microbatches": 1,
+        #     "enable_sequence_parallelism": True,
+        #     "sequence_parallelism_mode": "split_gather",
+        #     "enable_flash_attention": False,
+        #     "use_lazy_init": True,
+        #     "precision": "fp16",
+        #     "initial_scale": 1,
+        # },
+        # {
+        #     "tp_size": 2,
+        #     "pp_size": 2,
+        #     "num_microbatches": 4,
+        #     "enable_all_optimization": True,
+        #     "use_lazy_init": True,
+        #     "precision": "fp16",
+        #     "initial_scale": 1,
+        # },
+        # {
+        #     "tp_size": 1,
+        #     "pp_size": 2,
+        #     "num_microbatches": 2,
+        #     "enable_all_optimization": True,
+        #     "use_lazy_init": True,
+        #     "zero_stage": 1,
+        #     "precision": "fp16",
+        #     "initial_scale": 1,
+        # },
         {
             "tp_size": 4,
             "pp_size": 1,
@@ -152,25 +182,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "use_lazy_init": True,
             "precision": "fp16",
             "initial_scale": 1,
-        },
-        {
-            "tp_size": 2,
-            "pp_size": 2,
-            "num_microbatches": 4,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 2,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "zero_stage": 1,
-            "precision": "fp16",
-            "initial_scale": 1,
+            "fp8_communication": True,
         },
     ],
 )
@@ -272,4 +284,4 @@ def test_gpt2_3d():
 
 if __name__ == "__main__":
     test_gpt2()
-    test_gpt2_3d()
+    # test_gpt2_3d()
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index 8fe18f69bcd1..d32b0684eb96 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -34,7 +34,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     if enable_gradient_checkpointing:
         # org_model.gradient_checkpointing_enable()
         sharded_model.unwrap().gradient_checkpointing_enable()
-
     org_loss, org_output, sharded_loss, sharded_output = run_forward_backward_with_hybrid_plugin(
         org_model, sharded_model, sharded_optimizer, data_gen_fn, output_transform_fn, criterion, booster
     )
@@ -71,7 +70,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             )
             grad = grads[grad_index]
             sharded_grad = p1.grad.view(-1).chunk(dist.get_world_size())[dist.get_rank()]
-            assert_close(sharded_grad, grad[: sharded_grad.shape[0]], atol=5e-3, rtol=5e-3, check_dtype=False)
+            assert_close(sharded_grad, grad[: sharded_grad.shape[0]], atol=5e-2, rtol=5e-2, check_dtype=False)
 
     # Save gradient tensors for comparison between the original model and the sharded model before optimizer step.
     grads_to_check = {}
@@ -109,7 +108,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         if test_config["precision"] == "fp32":
             atol, rtol = 1e-5, 1e-3
         else:
-            atol, rtol = 5e-3, 5e-3
+            atol, rtol = 5e-2, 5e-2
 
         if org_model.__class__.__name__ == "LlamaModel":
             check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
@@ -121,7 +120,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         if test_config["precision"] == "fp32":
             atol, rtol = 1e-4, 1e-3
         else:
-            atol, rtol = 5e-3, 5e-3
+            atol, rtol = 5e-2, 5e-2
         try:
             check_weight(
                 llama_model,
@@ -146,104 +145,141 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @parameterize(
     "test_config",
     [
-        {  # Test ring + Flash attention
+        # {  # Test ring + Flash attention
+        #     "tp_size": 2,
+        #     "pp_size": 1,
+        #     "sp_size": 2,
+        #     "num_microbatches": 1,
+        #     "enable_sequence_parallelism": True,
+        #     "sequence_parallelism_mode": "ring",
+        #     "enable_flash_attention": True,
+        #     "use_lazy_init": True,
+        #     "zero_stage": 2,
+        #     "precision": "fp16",
+        #     "initial_scale": 1,
+        # },
+        # {  # Ulysess + Flash attention
+        #     "tp_size": 1,
+        #     "pp_size": 2,
+        #     "sp_size": 2,
+        #     "num_microbatches": 2,
+        #     "enable_sequence_parallelism": True,
+        #     "sequence_parallelism_mode": "all_to_all",
+        #     "enable_flash_attention": True,
+        #     "use_lazy_init": True,
+        #     "zero_stage": 1,
+        #     "precision": "fp16",
+        #     "initial_scale": 1,
+        # },
+        # {
+        #     "tp_size": 1,
+        #     "pp_size": 1,
+        #     "sp_size": 2,
+        #     "num_microbatches": 1,
+        #     "enable_sequence_parallelism": True,
+        #     "sequence_parallelism_mode": "all_to_all",
+        #     "use_lazy_init": True,
+        #     "zero_stage": 1,
+        #     "precision": "fp16",
+        #     "initial_scale": 1,
+        # },
+        # {
+        #     "tp_size": 4,
+        #     "pp_size": 1,
+        #     "num_microbatches": 1,
+        #     "enable_sequence_parallelism": True,
+        #     "sequence_parallelism_mode": "split_gather",
+        #     "enable_flash_attention": False,
+        #     "use_lazy_init": True,
+        #     "precision": "fp16",
+        #     "initial_scale": 1,
+        # },
+        # {
+        #     "tp_size": 2,
+        #     "pp_size": 2,
+        #     "num_microbatches": 2,
+        #     "enable_all_optimization": True,
+        #     "use_lazy_init": True,
+        #     "precision": "fp16",
+        #     "initial_scale": 1,
+        #     "enable_gradient_checkpointing": True,
+        #     "gradient_checkpoint_config": PipelineGradientCheckpointConfig(gradient_checkpointing_ratio=0.5),
+        # },
+        # {
+        #     "tp_size": 1,
+        #     "pp_size": 2,
+        #     "num_microbatches": 4,
+        #     "use_lazy_init": False,
+        #     "precision": "fp32",
+        #     "enable_gradient_checkpointing": True,
+        #     "gradient_checkpoint_config": PipelineGradientCheckpointConfig(num_ckpt_layers_per_stage=[4, 0]),
+        # },
+        # {
+        #     "tp_size": 2,
+        #     "pp_size": 1,
+        #     "enable_all_optimization": True,
+        #     "use_lazy_init": True,
+        #     "zero_stage": 2,
+        #     "precision": "fp16",
+        #     "initial_scale": 1,
+        # },
+        # {
+        #     "tp_size": 1,
+        #     "pp_size": 2,
+        #     "num_microbatches": 2,
+        #     "enable_all_optimization": True,
+        #     "use_lazy_init": True,
+        #     "zero_stage": 1,
+        #     "precision": "fp16",
+        #     "initial_scale": 1,
+        # },
+        {
             "tp_size": 2,
             "pp_size": 1,
             "sp_size": 2,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "ring",
-            "enable_flash_attention": True,
-            "use_lazy_init": True,
-            "zero_stage": 2,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
-        {  # Ulysess + Flash attention
-            "tp_size": 1,
-            "pp_size": 2,
-            "sp_size": 2,
-            "num_microbatches": 2,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
-            "enable_flash_attention": True,
+            "sequence_parallelism_mode": "split_gather",
             "use_lazy_init": True,
             "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
+            "fp8_communication": True,
         },
         {
-            "tp_size": 1,
+            "tp_size": 2,
             "pp_size": 1,
-            "sp_size": 2,
             "num_microbatches": 1,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
+            "enable_sequence_parallelism": False,
             "use_lazy_init": True,
             "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
+            "fp8_communication": True,
         },
         {
-            "tp_size": 4,
+            "tp_size": 1,
             "pp_size": 1,
+            "sp_size": 2,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "split_gather",
-            "enable_flash_attention": False,
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
-        {
-            "tp_size": 2,
-            "pp_size": 2,
-            "num_microbatches": 2,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "precision": "fp16",
-            "initial_scale": 1,
-            "enable_gradient_checkpointing": True,
-            "gradient_checkpoint_config": PipelineGradientCheckpointConfig(gradient_checkpointing_ratio=0.5),
-        },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 4,
-            "use_lazy_init": False,
-            "precision": "fp32",
-            "enable_gradient_checkpointing": True,
-            "gradient_checkpoint_config": PipelineGradientCheckpointConfig(num_ckpt_layers_per_stage=[4, 0]),
-        },
-        {
-            "tp_size": 2,
-            "pp_size": 1,
-            "enable_all_optimization": True,
-            "use_lazy_init": True,
-            "zero_stage": 2,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
-        {
-            "tp_size": 1,
-            "pp_size": 2,
-            "num_microbatches": 2,
-            "enable_all_optimization": True,
+            "sequence_parallelism_mode": "all_to_all",
             "use_lazy_init": True,
             "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
+            "fp8_communication": True,
         },
     ],
 )
 def run_llama_test(test_config):
-    sub_model_zoo = model_zoo.get_sub_registry("transformers_llama")
+    sub_model_zoo = model_zoo.get_sub_registry("transformers_llama_for_sequence_classification")
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         try:
             check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
         except Exception as e:
-            print(f"Failed config: {test_config}")
+            print(f"Failed config out: {test_config}")
             raise e
 
     clear_layout_converter()
@@ -291,7 +327,7 @@ def run_llama_test(test_config):
     ],
 )
 def run_llama_3d_test(test_config):
-    sub_model_zoo = model_zoo.get_sub_registry("transformers_llama")
+    sub_model_zoo = model_zoo.get_sub_registry("transformers_llama_for_sequence_classification")
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         try:
@@ -333,4 +369,4 @@ def test_llama_3d():
 
 if __name__ == "__main__":
     test_llama()
-    test_llama_3d()
+    # test_llama_3d()

From 5a310b9ee177eab8335c044055eb2e6b55a62dac Mon Sep 17 00:00:00 2001
From: GuangyaoZhang <xjtu521@qq.com>
Date: Wed, 17 Jul 2024 02:56:07 +0000
Subject: [PATCH 009/167] fix rebase

---
 colossalai/params.py                          |   1 -
 colossalai/quantization/fp8.py                |  16 +-
 colossalai/shardformer/layer/_operation.py    | 100 +++++-----
 colossalai/shardformer/modeling/llama.py      |  24 ++-
 colossalai/shardformer/policies/llama.py      |  14 +-
 examples/language/bert/finetune.py            |   5 +-
 examples/language/bert/test_ci.sh             |   2 +-
 .../gpt/hybridparallelism/finetune.py         |  10 +-
 .../test_model/test_shard_gpt2.py             |  78 ++++----
 .../test_model/test_shard_llama.py            | 174 +++++++-----------
 10 files changed, 193 insertions(+), 231 deletions(-)
 delete mode 100644 colossalai/params.py

diff --git a/colossalai/params.py b/colossalai/params.py
deleted file mode 100644
index 4058ce430834..000000000000
--- a/colossalai/params.py
+++ /dev/null
@@ -1 +0,0 @@
-to_cast = []
diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 66bcd0295fb6..a7ed61252631 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -55,7 +55,7 @@ def cast_from_fp8(inp: torch.Tensor, scale_inv: torch.Tensor, ret_type: torch.dt
     return ret.to(ret_type)
 
 
-def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3") -> None:
+def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3", group=None) -> None:
     r"""
     This is an in-place operation for compressed all_reduce using fp8.
     It works like dist.all_reduce but during communication the data is cast to fp8 format.
@@ -66,7 +66,7 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3") -> None:
         None
     """
 
-    world_size = dist.get_world_size()
+    world_size = dist.get_world_size(group=group)
     input_type = tensor.dtype
     input_shape = tensor.shape
     input_device = tensor.device
@@ -83,19 +83,19 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3") -> None:
         output_chunks = [torch.empty_like(input_chunks[-1]) for _ in range(world_size)]
     else:
         output_chunks = [torch.empty_like(input_chunks[0]) for _ in range(world_size)]
-    dist.all_to_all(output_chunks, input_chunks)
+    dist.all_to_all(output_chunks, input_chunks, group=group)
     scale_list = [torch.ones(1, dtype=scale.dtype, device=input_device) for _ in range(world_size)]
-    dist.all_gather(scale_list, scale)
+    dist.all_gather(scale_list, scale, group=group)
     summed_out = torch.zeros_like(output_chunks[0]).to(input_type)
     for scale, out in zip(scale_list, output_chunks):
         out = out.view(fp8_type)
         summed_out += cast_from_fp8(out, scale, input_type)
 
     summed_out_fp8, scale = cast_to_fp8(summed_out, fp8_format=fp8_format)
-    dist.all_gather(scale_list, scale)
+    dist.all_gather(scale_list, scale, group=group)
 
     tensor_list = list(torch.chunk(torch.empty(input_size, device=input_device, dtype=torch.uint8), world_size, dim=0))
-    dist.all_gather(tensor_list, summed_out_fp8.view(torch.uint8))
+    dist.all_gather(tensor_list, summed_out_fp8.view(torch.uint8), group=group)
     for i in range(world_size):
         tensor_list[i] = tensor_list[i].view(fp8_type).to(input_type) * scale_list[i]
     tensor_out = torch.cat(tensor_list, dim=0)
@@ -169,8 +169,8 @@ def cast_from_fp8_pipeline(inp: Any, del_metadata=True) -> None:
 
 def reduce_scatter_fp8(output: torch.Tensor, input_list, group, fp8_format="e4m3") -> None:
     r"""
-    This is an in-place operation for compressed all_reduce using fp8.
-    It works like dist.all_reduce but during communication the data is cast to fp8 format.
+    This is an in-place operation for compressed reduce_scatter using fp8.
+    It works like dist.reduce_scatter but during communication the data is cast to fp8 format.
 
     Args:
         tensor: torch.Tensor in fp32, fp16, bf16 datatype.
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index 12ed1d409310..c1a04357e5d7 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -94,7 +94,7 @@ def backward(ctx, grad_output):
             grad_output = grad_output.view(-1, grad_output.shape[-1])
             total_input = total_input.view(-1, total_input.shape[-1])
 
-        if fp8_communication and ctx.async_grad_allreduce:
+        if ctx.async_grad_allreduce and fp8_communication:
             _reduce(grad_input, group=ctx.process_group, fp8_communication=fp8_communication)
         elif ctx.async_grad_allreduce:
             # Asynchronous all-reduce
@@ -117,12 +117,11 @@ class LinearWithAsyncCommunication(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False):
+    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce):
         ctx.save_for_backward(input_, weight, bias)
         ctx.use_bias = bias is not None
         ctx.process_group = process_group
         ctx.async_grad_allreduce = async_grad_allreduce
-        ctx.fp8_communication = fp8_communication
         if bias is not None:
             output = F.linear(input_, weight, bias)
         else:
@@ -134,7 +133,6 @@ def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce, fp8_
     def backward(ctx, grad_output):
         input, weight, bias = ctx.saved_tensors
         use_bias = ctx.use_bias
-        fp8_communication = ctx.fp8_communication
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to bias.
         if use_bias:
@@ -150,10 +148,7 @@ def backward(ctx, grad_output):
 
         if ctx.async_grad_allreduce:
             # Asynchronous all-reduce
-            if fp8_communication:
-                all_reduce_fp8(grad_input, group=ctx.process_group)
-            else:
-                handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
+            handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
             # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
             # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py
 
@@ -172,7 +167,7 @@ def backward(ctx, grad_output):
 
         grad_bias = grad_output.sum(dim=0) if use_bias else None
 
-        if ctx.async_grad_allreduce and not fp8_communication:
+        if ctx.async_grad_allreduce:
             handle.wait()
 
         return grad_input, grad_weight, grad_bias, None, None, None, None
@@ -243,18 +238,16 @@ class _GatherForwardReduceScatterBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group, dim, fp8_communication=False):
+    def forward(ctx, input_, process_group, dim):
         ctx.process_group = process_group
         ctx.dim = dim
-        ctx.fp8_communication = fp8_communication
 
-        return _gather(input_, dim, process_group, fp8_communication)
+        return _gather(input_, dim, process_group)
 
     @staticmethod
     def backward(ctx, grad_output):
         dim = ctx.dim
         process_group = ctx.process_group
-        fp8_communication = ctx.fp8_communication
         # do reduce-scatter
         new_shape = list(grad_output.shape)
         assert (
@@ -266,10 +259,7 @@ def backward(ctx, grad_output):
         ]
         output = torch.empty(new_shape, dtype=grad_output.dtype, device=grad_output.device)
 
-        if fp8_communication:
-            reduce_scatter_fp8(output, grad_list, group=process_group)
-        else:
-            dist.reduce_scatter(output, grad_list, group=process_group)
+        dist.reduce_scatter(output, grad_list, group=process_group)
 
         return output, None, None, None
 
@@ -576,7 +566,6 @@ def forward(ctx, input_, process_group, dim, fp8_communication=False):
         input_list = [item.contiguous() for item in torch.chunk(input_, dist.get_world_size(process_group), dim=dim)]
         output = torch.empty(new_shape, dtype=input_.dtype, device=input_.device)
         if fp8_communication:
-            # if False:
             reduce_scatter_fp8(output, input_list, group=process_group)
         else:
             dist.reduce_scatter(output, input_list, group=process_group)
@@ -588,8 +577,7 @@ def backward(ctx, grad_output):
         dim = ctx.dim
         process_group = ctx.process_group
         fp8_communication = ctx.fp8_communication
-
-        return _gather(grad_output, dim, process_group, fp8_communication), None, None, None
+        return _gather(grad_output, dim, process_group, fp8_communication=fp8_communication), None, None, None
 
 
 class _MatmulWithGatherForwardReduceScatterBackward(torch.autograd.Function):
@@ -793,12 +781,12 @@ class _GatherForwardSplitBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, dim, process_group, grad_scale=None, fp8_comm=False):
+    def forward(ctx, input_, dim, process_group, grad_scale=None, fp8_communication=False):
         ctx.process_group = process_group
         ctx.dim = dim
         ctx.grad_scale = grad_scale
 
-        return _gather(input_, dim, process_group, fp8_comm=fp8_comm, fp8_format="e4m3")
+        return _gather(input_, dim, process_group, fp8_communication=fp8_communication, fp8_format="e4m3")
 
     @staticmethod
     def backward(ctx, grad_output):
@@ -829,11 +817,23 @@ def forward(ctx, input_, process_group, scatter_dim, gather_dim, fp8_communicati
         # using all_to_all_single when batch size is 1
         if bsz == 1:
             return _all_to_all_single(
-                input_, world_size, process_group, scatter_dim, gather_dim, fp8_comm=fp8_comm, fp8_format="e5m2"
+                input_,
+                world_size,
+                process_group,
+                scatter_dim,
+                gather_dim,
+                fp8_communication=fp8_communication,
+                fp8_format="e5m2",
             )
         else:
             return _all_to_all(
-                input_, world_size, process_group, scatter_dim, gather_dim, fp8_comm=fp8_comm, fp8_format="e5m2"
+                input_,
+                world_size,
+                process_group,
+                scatter_dim,
+                gather_dim,
+                fp8_communication=fp8_communication,
+                fp8_format="e5m2",
             )
 
     @staticmethod
@@ -841,17 +841,29 @@ def backward(ctx, grad_output):
         process_group = ctx.process_group
         scatter_dim = ctx.gather_dim
         gather_dim = ctx.scatter_dim
-        ctx.fp8_communication
+        fp8_communication = ctx.fp8_communication
         world_size = dist.get_world_size(process_group)
         bsz, _, _ = grad_output.shape
 
         if bsz == 1:
             return_grad = _all_to_all_single(
-                grad_output, world_size, process_group, scatter_dim, gather_dim, fp8_comm=fp8_comm, fp8_format="e5m2"
+                grad_output,
+                world_size,
+                process_group,
+                scatter_dim,
+                gather_dim,
+                fp8_communication=fp8_communication,
+                fp8_format="e5m2",
             )
         else:
             return_grad = _all_to_all(
-                grad_output, world_size, process_group, scatter_dim, gather_dim, fp8_comm=fp8_comm, fp8_format="e5m2"
+                grad_output,
+                world_size,
+                process_group,
+                scatter_dim,
+                gather_dim,
+                fp8_communication=fp8_communication,
+                fp8_format="e5m2",
             )
 
         return (return_grad, None, None, None, None)
@@ -912,10 +924,7 @@ def _split(input_, dim=-1, process_group=None):
     return output
 
 
-from colossalai.params import to_cast
-
-
-def _gather(input_, dim=-1, process_group=None, fp8_comm=False, fp8_format="e4m3"):
+def _gather(input_, dim=-1, process_group=None, fp8_communication=False, fp8_format="e4m3"):
     # skip if only one rank involved
     world_size = dist.get_world_size(process_group)
     if world_size == 1:
@@ -926,13 +935,12 @@ def _gather(input_, dim=-1, process_group=None, fp8_comm=False, fp8_format="e4m3
 
     from colossalai.zero.low_level._utils import has_inf_or_nan
 
-    if fp8_comm:
+    if fp8_communication:
         # if False:
         if has_inf_or_nan(input_):
             print("input has nan")
             exit(0)
         input_type = input_.dtype
-        to_cast.append(input_)
         ret, scale = cast_to_fp8(input_, fp8_format="e5m2")
         if has_inf_or_nan(ret):
             import pdb
@@ -1012,8 +1020,8 @@ def _reduce_scatter(input_, dim=1, process_group=None):
     return output
 
 
-def _all_to_all(input_, world_size, group, scatter_dim, gather_dim, fp8_comm=False, fp8_format="e5m2"):
-    if fp8_comm:
+def _all_to_all(input_, world_size, group, scatter_dim, gather_dim, fp8_communication=False, fp8_format="e5m2"):
+    if fp8_communication:
         input_type = input_.dtype
         ret, scale = cast_to_fp8(input_, fp8_format=fp8_format)
         fp8_type = ret.dtype
@@ -1036,7 +1044,9 @@ def _all_to_all(input_, world_size, group, scatter_dim, gather_dim, fp8_comm=Fal
     return torch.cat(output_list, dim=gather_dim).contiguous()
 
 
-def _all_to_all_single(input_, seq_world_size, group, scatter_dim, gather_dim, fp8_comm=False, fp8_format="e5m2"):
+def _all_to_all_single(
+    input_, seq_world_size, group, scatter_dim, gather_dim, fp8_communication=False, fp8_format="e5m2"
+):
     inp_shape = list(input_.shape)
     inp_shape[scatter_dim] = inp_shape[scatter_dim] // seq_world_size
     if scatter_dim < 2:
@@ -1048,7 +1058,7 @@ def _all_to_all_single(input_, seq_world_size, group, scatter_dim, gather_dim, f
             .contiguous()
         )
 
-    if fp8_comm:
+    if fp8_communication:
         input_type = input_t.dtype
         ret, scale = cast_to_fp8(input_t, fp8_format=fp8_format)
         fp8_type = ret.dtype
@@ -1085,10 +1095,8 @@ def matmul_with_async_comm(input_, weight, bias, process_group, async_grad_allre
     )
 
 
-def linear_with_async_comm(input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False):
-    return LinearWithAsyncCommunication.apply(
-        input_, weight, bias, process_group, async_grad_allreduce, fp8_communication
-    )
+def linear_with_async_comm(input_, weight, bias, process_group, async_grad_allreduce):
+    return LinearWithAsyncCommunication.apply(input_, weight, bias, process_group, async_grad_allreduce)
 
 
 def linear_gather_forward_reducescatter_backward(
@@ -1099,8 +1107,8 @@ def linear_gather_forward_reducescatter_backward(
     )
 
 
-def gather_forward_reducescatter_backward(input_, process_group, dim, fp8_communication=False):
-    return _GatherForwardReduceScatterBackward.apply(input_, process_group, dim, fp8_communication)
+def gather_forward_reducescatter_backward(input_, process_group, dim):
+    return _GatherForwardReduceScatterBackward.apply(input_, process_group, dim)
 
 
 def reducescatter_forward_gather_backward(input_, process_group, dim, fp8_communication=False):
@@ -1132,8 +1140,8 @@ def reduce_forward(input_, process_group, fp8_communication=False):
 
 
 def reduce_backward(input_, process_group, fp8_communication=False):
-    return _ReduceBackward.apply(input_, process_group, fp8_communication=fp8_communication)
+    return _ReduceBackward.apply(input_, process_group, fp8_communication)
 
 
-def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1, fp8_comm=False):
-    return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim, fp8_comm)
+def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1, fp8_communication=False):
+    return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim, fp8_communication)
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 05c9fe3bd9c7..26d1b6e4c0d2 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -510,9 +510,9 @@ def forward(
 
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
-            query_states = all_to_all_comm(query_states, sp_group, fp8_comm=shard_config.fp8_communication)
-            key_states = all_to_all_comm(key_states, sp_group, fp8_comm=shard_config.fp8_communication)
-            value_states = all_to_all_comm(value_states, sp_group, fp8_comm=shard_config.fp8_communication)
+            query_states = all_to_all_comm(query_states, sp_group)
+            key_states = all_to_all_comm(key_states, sp_group)
+            value_states = all_to_all_comm(value_states, sp_group)
             bsz, q_len, _ = query_states.size()
 
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
@@ -660,11 +660,16 @@ def forward(
 
         if sp_mode in ["ring", "split_gather"]:
             inputs_embeds = split_forward_gather_backward(
-                inputs_embeds, 1, sp_group, fp8_comm=shard_config.fp8_communication
+                inputs_embeds,
+                1,
+                sp_group,
             )
         elif sp_mode == "all_to_all":
             inputs_embeds = split_forward_gather_backward(
-                inputs_embeds, 1, sp_group, 1 / sp_size, fp8_comm=shard_config.fp8_communication
+                inputs_embeds,
+                1,
+                sp_group,
+                1 / sp_size,
             )
         hidden_states = inputs_embeds
 
@@ -711,11 +716,16 @@ def forward(
 
         if sp_mode == "ring" or sp_mode == "split_gather":
             hidden_states = gather_forward_split_backward(
-                hidden_states, 1, sp_group, fp8_comm=shard_config.fp8_communication
+                hidden_states,
+                1,
+                sp_group,
             )
         elif sp_mode == "all_to_all":
             hidden_states = gather_forward_split_backward(
-                hidden_states, 1, sp_group, grad_scale=sp_size, fp8_comm=shard_config.fp8_communication
+                hidden_states,
+                1,
+                sp_group,
+                grad_scale=sp_size,
             )
 
         # add hidden states from the last decoder layer
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index be21b33e10a6..85ec6717d0d1 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -134,37 +134,37 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.gate_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.up_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.down_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                 ],
             )
diff --git a/examples/language/bert/finetune.py b/examples/language/bert/finetune.py
index d9246ff49a9f..8a59ab6838a6 100644
--- a/examples/language/bert/finetune.py
+++ b/examples/language/bert/finetune.py
@@ -224,10 +224,7 @@ def main():
         # modify the param accordingly for finetuning test cases
         plugin = HybridParallelPlugin(
             tp_size=1,
-            pp_size=1,
-            sp_size=2,
-            enable_sequence_parallelism=True,
-            sequence_parallelism_mode="all_to_all",
+            pp_size=2,
             num_microbatches=None,
             pp_style="interleaved",
             num_model_chunks=2,
diff --git a/examples/language/bert/test_ci.sh b/examples/language/bert/test_ci.sh
index de35e0fa5c53..fc4eacf6fe79 100755
--- a/examples/language/bert/test_ci.sh
+++ b/examples/language/bert/test_ci.sh
@@ -5,7 +5,7 @@ pip install -r requirements.txt
 
 FAIL_LIMIT=3
 
-for plugin in "hybrid_parallel"; do
+for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero" "hybrid_parallel"; do
     for i in $(seq 1 $FAIL_LIMIT); do
         torchrun --standalone --nproc_per_node 4 finetune.py --target_f1 0.86 --plugin $plugin --model_type "bert" && break
         echo "Failed $i times"
diff --git a/examples/language/gpt/hybridparallelism/finetune.py b/examples/language/gpt/hybridparallelism/finetune.py
index ae80ddad5884..a4b48fa05eb3 100644
--- a/examples/language/gpt/hybridparallelism/finetune.py
+++ b/examples/language/gpt/hybridparallelism/finetune.py
@@ -220,9 +220,9 @@ def main():
         plugin = HybridParallelPlugin(
             tp_size=2,
             pp_size=1,
-            sp_size=2,
-            sequence_parallelism_mode="split_gather",
-            enable_sequence_parallelism=True,
+            sp_size=1,
+            # sequence_parallelism_mode="split_gather",
+            # enable_sequence_parallelism=True,
             num_microbatches=None,
             microbatch_size=1,
             enable_all_optimization=True,
@@ -321,7 +321,3 @@ def _criterion(outputs, inputs):
 
 if __name__ == "__main__":
     main()
-    if dist.get_rank() == 0:
-        import pdb
-
-        pdb.set_trace()
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index 8fa0997d540c..f9e368c0ebf3 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -51,7 +51,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         if test_config["precision"] == "fp32":
             atol, rtol = 1e-4, 1e-3
         else:
-            atol, rtol = 5e-2, 5e-2
+            atol, rtol = 5e-3, 5e-3
         col_layer_grads = get_grad_tensors_for_check(
             gpt2,
             sharded_gpt2,
@@ -97,7 +97,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         if test_config["precision"] == "fp32":
             atol, rtol = 1e-5, 1e-3
         else:
-            atol, rtol = 5e-2, 5e-2
+            atol, rtol = 5e-3, 5e-3
 
         if org_model.__class__.__name__ == "GPT2Model":
             check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
@@ -131,47 +131,17 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @parameterize(
     "test_config",
     [
-        # {
-        #     "tp_size": 4,
-        #     "pp_size": 1,
-        #     "num_microbatches": 1,
-        #     "enable_sequence_parallelism": True,
-        #     "sequence_parallelism_mode": "ring",
-        #     "enable_flash_attention": False,
-        #     "use_lazy_init": True,
-        #     "precision": "fp32",
-        #     "initial_scale": 1,
-        # },
-        # {
-        #     "tp_size": 4,
-        #     "pp_size": 1,
-        #     "num_microbatches": 1,
-        #     "enable_sequence_parallelism": True,
-        #     "sequence_parallelism_mode": "split_gather",
-        #     "enable_flash_attention": False,
-        #     "use_lazy_init": True,
-        #     "precision": "fp16",
-        #     "initial_scale": 1,
-        # },
-        # {
-        #     "tp_size": 2,
-        #     "pp_size": 2,
-        #     "num_microbatches": 4,
-        #     "enable_all_optimization": True,
-        #     "use_lazy_init": True,
-        #     "precision": "fp16",
-        #     "initial_scale": 1,
-        # },
-        # {
-        #     "tp_size": 1,
-        #     "pp_size": 2,
-        #     "num_microbatches": 2,
-        #     "enable_all_optimization": True,
-        #     "use_lazy_init": True,
-        #     "zero_stage": 1,
-        #     "precision": "fp16",
-        #     "initial_scale": 1,
-        # },
+        {
+            "tp_size": 4,
+            "pp_size": 1,
+            "num_microbatches": 1,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "ring",
+            "enable_flash_attention": False,
+            "use_lazy_init": True,
+            "precision": "fp32",
+            "initial_scale": 1,
+        },
         {
             "tp_size": 4,
             "pp_size": 1,
@@ -182,7 +152,25 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "use_lazy_init": True,
             "precision": "fp16",
             "initial_scale": 1,
-            "fp8_communication": True,
+        },
+        {
+            "tp_size": 2,
+            "pp_size": 2,
+            "num_microbatches": 4,
+            "enable_all_optimization": True,
+            "use_lazy_init": True,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 1,
+            "pp_size": 2,
+            "num_microbatches": 2,
+            "enable_all_optimization": True,
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
         },
     ],
 )
@@ -284,4 +272,4 @@ def test_gpt2_3d():
 
 if __name__ == "__main__":
     test_gpt2()
-    # test_gpt2_3d()
+    test_gpt2_3d()
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index d32b0684eb96..8fe18f69bcd1 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -34,6 +34,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     if enable_gradient_checkpointing:
         # org_model.gradient_checkpointing_enable()
         sharded_model.unwrap().gradient_checkpointing_enable()
+
     org_loss, org_output, sharded_loss, sharded_output = run_forward_backward_with_hybrid_plugin(
         org_model, sharded_model, sharded_optimizer, data_gen_fn, output_transform_fn, criterion, booster
     )
@@ -70,7 +71,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             )
             grad = grads[grad_index]
             sharded_grad = p1.grad.view(-1).chunk(dist.get_world_size())[dist.get_rank()]
-            assert_close(sharded_grad, grad[: sharded_grad.shape[0]], atol=5e-2, rtol=5e-2, check_dtype=False)
+            assert_close(sharded_grad, grad[: sharded_grad.shape[0]], atol=5e-3, rtol=5e-3, check_dtype=False)
 
     # Save gradient tensors for comparison between the original model and the sharded model before optimizer step.
     grads_to_check = {}
@@ -108,7 +109,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         if test_config["precision"] == "fp32":
             atol, rtol = 1e-5, 1e-3
         else:
-            atol, rtol = 5e-2, 5e-2
+            atol, rtol = 5e-3, 5e-3
 
         if org_model.__class__.__name__ == "LlamaModel":
             check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
@@ -120,7 +121,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         if test_config["precision"] == "fp32":
             atol, rtol = 1e-4, 1e-3
         else:
-            atol, rtol = 5e-2, 5e-2
+            atol, rtol = 5e-3, 5e-3
         try:
             check_weight(
                 llama_model,
@@ -145,141 +146,104 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @parameterize(
     "test_config",
     [
-        # {  # Test ring + Flash attention
-        #     "tp_size": 2,
-        #     "pp_size": 1,
-        #     "sp_size": 2,
-        #     "num_microbatches": 1,
-        #     "enable_sequence_parallelism": True,
-        #     "sequence_parallelism_mode": "ring",
-        #     "enable_flash_attention": True,
-        #     "use_lazy_init": True,
-        #     "zero_stage": 2,
-        #     "precision": "fp16",
-        #     "initial_scale": 1,
-        # },
-        # {  # Ulysess + Flash attention
-        #     "tp_size": 1,
-        #     "pp_size": 2,
-        #     "sp_size": 2,
-        #     "num_microbatches": 2,
-        #     "enable_sequence_parallelism": True,
-        #     "sequence_parallelism_mode": "all_to_all",
-        #     "enable_flash_attention": True,
-        #     "use_lazy_init": True,
-        #     "zero_stage": 1,
-        #     "precision": "fp16",
-        #     "initial_scale": 1,
-        # },
-        # {
-        #     "tp_size": 1,
-        #     "pp_size": 1,
-        #     "sp_size": 2,
-        #     "num_microbatches": 1,
-        #     "enable_sequence_parallelism": True,
-        #     "sequence_parallelism_mode": "all_to_all",
-        #     "use_lazy_init": True,
-        #     "zero_stage": 1,
-        #     "precision": "fp16",
-        #     "initial_scale": 1,
-        # },
-        # {
-        #     "tp_size": 4,
-        #     "pp_size": 1,
-        #     "num_microbatches": 1,
-        #     "enable_sequence_parallelism": True,
-        #     "sequence_parallelism_mode": "split_gather",
-        #     "enable_flash_attention": False,
-        #     "use_lazy_init": True,
-        #     "precision": "fp16",
-        #     "initial_scale": 1,
-        # },
-        # {
-        #     "tp_size": 2,
-        #     "pp_size": 2,
-        #     "num_microbatches": 2,
-        #     "enable_all_optimization": True,
-        #     "use_lazy_init": True,
-        #     "precision": "fp16",
-        #     "initial_scale": 1,
-        #     "enable_gradient_checkpointing": True,
-        #     "gradient_checkpoint_config": PipelineGradientCheckpointConfig(gradient_checkpointing_ratio=0.5),
-        # },
-        # {
-        #     "tp_size": 1,
-        #     "pp_size": 2,
-        #     "num_microbatches": 4,
-        #     "use_lazy_init": False,
-        #     "precision": "fp32",
-        #     "enable_gradient_checkpointing": True,
-        #     "gradient_checkpoint_config": PipelineGradientCheckpointConfig(num_ckpt_layers_per_stage=[4, 0]),
-        # },
-        # {
-        #     "tp_size": 2,
-        #     "pp_size": 1,
-        #     "enable_all_optimization": True,
-        #     "use_lazy_init": True,
-        #     "zero_stage": 2,
-        #     "precision": "fp16",
-        #     "initial_scale": 1,
-        # },
-        # {
-        #     "tp_size": 1,
-        #     "pp_size": 2,
-        #     "num_microbatches": 2,
-        #     "enable_all_optimization": True,
-        #     "use_lazy_init": True,
-        #     "zero_stage": 1,
-        #     "precision": "fp16",
-        #     "initial_scale": 1,
-        # },
-        {
+        {  # Test ring + Flash attention
             "tp_size": 2,
             "pp_size": 1,
             "sp_size": 2,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "split_gather",
+            "sequence_parallelism_mode": "ring",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "zero_stage": 2,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {  # Ulysess + Flash attention
+            "tp_size": 1,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "all_to_all",
+            "enable_flash_attention": True,
             "use_lazy_init": True,
             "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
-            "fp8_communication": True,
         },
         {
-            "tp_size": 2,
+            "tp_size": 1,
             "pp_size": 1,
+            "sp_size": 2,
             "num_microbatches": 1,
-            "enable_sequence_parallelism": False,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "all_to_all",
             "use_lazy_init": True,
             "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
-            "fp8_communication": True,
         },
         {
-            "tp_size": 1,
+            "tp_size": 4,
             "pp_size": 1,
-            "sp_size": 2,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
+            "sequence_parallelism_mode": "split_gather",
+            "enable_flash_attention": False,
+            "use_lazy_init": True,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 2,
+            "pp_size": 2,
+            "num_microbatches": 2,
+            "enable_all_optimization": True,
+            "use_lazy_init": True,
+            "precision": "fp16",
+            "initial_scale": 1,
+            "enable_gradient_checkpointing": True,
+            "gradient_checkpoint_config": PipelineGradientCheckpointConfig(gradient_checkpointing_ratio=0.5),
+        },
+        {
+            "tp_size": 1,
+            "pp_size": 2,
+            "num_microbatches": 4,
+            "use_lazy_init": False,
+            "precision": "fp32",
+            "enable_gradient_checkpointing": True,
+            "gradient_checkpoint_config": PipelineGradientCheckpointConfig(num_ckpt_layers_per_stage=[4, 0]),
+        },
+        {
+            "tp_size": 2,
+            "pp_size": 1,
+            "enable_all_optimization": True,
+            "use_lazy_init": True,
+            "zero_stage": 2,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 1,
+            "pp_size": 2,
+            "num_microbatches": 2,
+            "enable_all_optimization": True,
             "use_lazy_init": True,
             "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
-            "fp8_communication": True,
         },
     ],
 )
 def run_llama_test(test_config):
-    sub_model_zoo = model_zoo.get_sub_registry("transformers_llama_for_sequence_classification")
+    sub_model_zoo = model_zoo.get_sub_registry("transformers_llama")
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         try:
             check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
         except Exception as e:
-            print(f"Failed config out: {test_config}")
+            print(f"Failed config: {test_config}")
             raise e
 
     clear_layout_converter()
@@ -327,7 +291,7 @@ def run_llama_test(test_config):
     ],
 )
 def run_llama_3d_test(test_config):
-    sub_model_zoo = model_zoo.get_sub_registry("transformers_llama_for_sequence_classification")
+    sub_model_zoo = model_zoo.get_sub_registry("transformers_llama")
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         try:
@@ -369,4 +333,4 @@ def test_llama_3d():
 
 if __name__ == "__main__":
     test_llama()
-    # test_llama_3d()
+    test_llama_3d()

From 6a20f07b8090b2fe801eb0bc0b33b397beb6d1fd Mon Sep 17 00:00:00 2001
From: GuangyaoZhang <xjtu521@qq.com>
Date: Wed, 17 Jul 2024 05:33:38 +0000
Subject: [PATCH 010/167] remove all to all

---
 colossalai/quantization/fp8.py                |   4 +-
 colossalai/shardformer/layer/_operation.py    | 153 +++---------------
 colossalai/shardformer/layer/linear.py        |  18 +--
 colossalai/shardformer/modeling/llama.py      |  30 +---
 .../gpt/hybridparallelism/finetune.py         |   7 +-
 5 files changed, 36 insertions(+), 176 deletions(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index a7ed61252631..867de839e19d 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -55,7 +55,7 @@ def cast_from_fp8(inp: torch.Tensor, scale_inv: torch.Tensor, ret_type: torch.dt
     return ret.to(ret_type)
 
 
-def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3", group=None) -> None:
+def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e5m2", group=None) -> None:
     r"""
     This is an in-place operation for compressed all_reduce using fp8.
     It works like dist.all_reduce but during communication the data is cast to fp8 format.
@@ -167,7 +167,7 @@ def cast_from_fp8_pipeline(inp: Any, del_metadata=True) -> None:
         del inp["fp8_scale"]
 
 
-def reduce_scatter_fp8(output: torch.Tensor, input_list, group, fp8_format="e4m3") -> None:
+def reduce_scatter_fp8(output: torch.Tensor, input_list, group, fp8_format="e5m2") -> None:
     r"""
     This is an in-place operation for compressed reduce_scatter using fp8.
     It works like dist.reduce_scatter but during communication the data is cast to fp8 format.
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index c1a04357e5d7..604a154d0c0b 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -170,7 +170,7 @@ def backward(ctx, grad_output):
         if ctx.async_grad_allreduce:
             handle.wait()
 
-        return grad_input, grad_weight, grad_bias, None, None, None, None
+        return grad_input, grad_weight, grad_bias, None, None, None
 
 
 def _ring_as_gather(func, input_to_gather=None, input_local=None, process_group=None, gather_dim=1, keep_item=False):
@@ -261,7 +261,7 @@ def backward(ctx, grad_output):
 
         dist.reduce_scatter(output, grad_list, group=process_group)
 
-        return output, None, None, None
+        return output, None, None
 
 
 class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function):
@@ -729,7 +729,7 @@ def backward(ctx, grad_output):
             grad_output = grad_output * ctx.grad_scale
 
         # to_cast.append(grad_output.cpu().detach().numpy())
-        return _gather(grad_output, ctx.dim, ctx.process_group, ctx.fp8_communication, "e4m3"), None, None, None, None
+        return _gather(grad_output, ctx.dim, ctx.process_group, ctx.fp8_communication), None, None, None, None
 
 
 class _ReduceForward(torch.autograd.Function):
@@ -786,7 +786,7 @@ def forward(ctx, input_, dim, process_group, grad_scale=None, fp8_communication=
         ctx.dim = dim
         ctx.grad_scale = grad_scale
 
-        return _gather(input_, dim, process_group, fp8_communication=fp8_communication, fp8_format="e4m3")
+        return _gather(input_, dim, process_group, fp8_communication=fp8_communication)
 
     @staticmethod
     def backward(ctx, grad_output):
@@ -806,67 +806,26 @@ class _AllToAll(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group, scatter_dim, gather_dim, fp8_communication):
+    def forward(ctx, input_, process_group, scatter_dim, gather_dim):
         ctx.process_group = process_group
         ctx.scatter_dim = scatter_dim
         ctx.gather_dim = gather_dim
-        ctx.fp8_communication = fp8_communication
         world_size = dist.get_world_size(process_group)
         bsz, _, _ = input_.shape
 
         # using all_to_all_single when batch size is 1
         if bsz == 1:
-            return _all_to_all_single(
-                input_,
-                world_size,
-                process_group,
-                scatter_dim,
-                gather_dim,
-                fp8_communication=fp8_communication,
-                fp8_format="e5m2",
-            )
+            return _all_to_all_single(input_, world_size, process_group, scatter_dim, gather_dim)
         else:
-            return _all_to_all(
-                input_,
-                world_size,
-                process_group,
-                scatter_dim,
-                gather_dim,
-                fp8_communication=fp8_communication,
-                fp8_format="e5m2",
-            )
+            return _all_to_all(input_, world_size, process_group, scatter_dim, gather_dim)
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         process_group = ctx.process_group
         scatter_dim = ctx.gather_dim
         gather_dim = ctx.scatter_dim
-        fp8_communication = ctx.fp8_communication
-        world_size = dist.get_world_size(process_group)
-        bsz, _, _ = grad_output.shape
-
-        if bsz == 1:
-            return_grad = _all_to_all_single(
-                grad_output,
-                world_size,
-                process_group,
-                scatter_dim,
-                gather_dim,
-                fp8_communication=fp8_communication,
-                fp8_format="e5m2",
-            )
-        else:
-            return_grad = _all_to_all(
-                grad_output,
-                world_size,
-                process_group,
-                scatter_dim,
-                gather_dim,
-                fp8_communication=fp8_communication,
-                fp8_format="e5m2",
-            )
-
-        return (return_grad, None, None, None, None)
+        return_grad = _AllToAll.apply(*grad_output, process_group, scatter_dim, gather_dim)
+        return (return_grad, None, None, None)
 
 
 class HookParameter(torch.autograd.Function):
@@ -924,41 +883,20 @@ def _split(input_, dim=-1, process_group=None):
     return output
 
 
-def _gather(input_, dim=-1, process_group=None, fp8_communication=False, fp8_format="e4m3"):
+def _gather(input_, dim=-1, process_group=None, fp8_communication=False, fp8_format="e5m2"):
     # skip if only one rank involved
     world_size = dist.get_world_size(process_group)
     if world_size == 1:
         return input_
 
-    # all gather
-    import torch.distributed as dista
-
-    from colossalai.zero.low_level._utils import has_inf_or_nan
-
     if fp8_communication:
-        # if False:
-        if has_inf_or_nan(input_):
-            print("input has nan")
-            exit(0)
         input_type = input_.dtype
-        ret, scale = cast_to_fp8(input_, fp8_format="e5m2")
-        if has_inf_or_nan(ret):
-            import pdb
-
-            pdb.set_trace()
-            print("cast has nan")
-            # exit(0)
-        dista.barrier()
+        ret, scale = cast_to_fp8(input_, fp8_format=fp8_format)
         fp8_type = ret.dtype
         input_ = ret.view(torch.uint8)
         input_ = input_.contiguous()
         tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
         scale = torch.tensor(scale, dtype=torch.float32).to(input_.device)
-        # import torch.distributed as dista
-        # if dista.get_rank()==0:
-        #     import pdb
-        #     pdb.set_trace()
-        # dista.barrier()
         scale_list = [torch.ones(1, dtype=torch.float32, device=input_.device) for _ in range(world_size)]
 
         scale = torch.tensor(scale).to(input_.device)
@@ -969,24 +907,10 @@ def _gather(input_, dim=-1, process_group=None, fp8_communication=False, fp8_for
         for output, scale in zip(tensor_list, scale_list):
             output = output.view(fp8_type)
             output = cast_from_fp8(output, scale, input_type)
-            if has_inf_or_nan(output) and dista.get_rank() == 0:
-                print("casted_output has nan")
-                import pdb
-
-                pdb.set_trace()
-            dista.barrier()
-
             cast_tensor_list.append(output)
 
         output = torch.cat(cast_tensor_list, dim=dim).contiguous()
 
-        if has_inf_or_nan(output):
-            print("output has nan")
-            exit(0)
-            # import pdb
-            # pdb.set_trace()
-        dista.barrier()
-
     else:
         input_ = input_.contiguous()
         tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
@@ -1020,33 +944,14 @@ def _reduce_scatter(input_, dim=1, process_group=None):
     return output
 
 
-def _all_to_all(input_, world_size, group, scatter_dim, gather_dim, fp8_communication=False, fp8_format="e5m2"):
-    if fp8_communication:
-        input_type = input_.dtype
-        ret, scale = cast_to_fp8(input_, fp8_format=fp8_format)
-        fp8_type = ret.dtype
-        input_ = ret.view(torch.uint8)
-        input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
-        output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
-        scale_list = [torch.ones(1, dtype=scale.dtype, device=input_.device) for _ in range(world_size)]
-        dist.all_to_all(output_list, input_list, group=group)
-        dist.all_gather(scale_list, scale, group=group)
-        cast_tensor_list = []
-        for output, scale in zip(output_list, scale_list):
-            output = output.view(fp8_type)
-            output = cast_from_fp8(output, scale, input_type)
-            cast_tensor_list.append(output)
-        output_list = cast_tensor_list
-    else:
-        input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
-        output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
-        dist.all_to_all(output_list, input_list, group=group)
+def _all_to_all(input_, world_size, group, scatter_dim, gather_dim):
+    input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
+    output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
+    dist.all_to_all(output_list, input_list, group=group)
     return torch.cat(output_list, dim=gather_dim).contiguous()
 
 
-def _all_to_all_single(
-    input_, seq_world_size, group, scatter_dim, gather_dim, fp8_communication=False, fp8_format="e5m2"
-):
+def _all_to_all_single(input_, seq_world_size, group, scatter_dim, gather_dim):
     inp_shape = list(input_.shape)
     inp_shape[scatter_dim] = inp_shape[scatter_dim] // seq_world_size
     if scatter_dim < 2:
@@ -1058,24 +963,8 @@ def _all_to_all_single(
             .contiguous()
         )
 
-    if fp8_communication:
-        input_type = input_t.dtype
-        ret, scale = cast_to_fp8(input_t, fp8_format=fp8_format)
-        fp8_type = ret.dtype
-        input_t = ret.view(torch.uint8)
-        output = torch.empty_like(input_t)
-        scale_list = [torch.ones(1, dtype=scale.dtype, device=input_.device) for _ in range(seq_world_size)]
-        dist.all_to_all_single(output, input_t, group=group)
-        dist.all_gather(scale_list, scale, group=group)
-        cast_tensor_list = []
-        for output_part, scale in zip(output, scale_list):
-            output_part = output_part.view(fp8_type)
-            output_part = cast_from_fp8(output_part, scale, input_type)
-            cast_tensor_list.append(output_part)
-        output = torch.stack(cast_tensor_list, dim=0)
-    else:
-        output = torch.empty_like(input_t)
-        dist.all_to_all_single(output, input_t, group=group)
+    output = torch.empty_like(input_t)
+    dist.all_to_all_single(output, input_t, group=group)
 
     if scatter_dim < 2:
         output = output.transpose(0, 1).contiguous()
@@ -1143,5 +1032,5 @@ def reduce_backward(input_, process_group, fp8_communication=False):
     return _ReduceBackward.apply(input_, process_group, fp8_communication)
 
 
-def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1, fp8_communication=False):
-    return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim, fp8_communication)
+def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1):
+    return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim)
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index af25c398b994..37c7542416f6 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -84,7 +84,6 @@ def __init__(
         bias_: Optional[Parameter] = None,
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
-        fp8_communication: bool = False,
         **kwargs,
     ):
         super().__init__(weight=weight, bias_=bias_, **kwargs)
@@ -99,7 +98,6 @@ def __init__(
         self.skip_bias_add = skip_bias_add
         self.device = device
         self.process_group = process_group
-        self.fp8_communication = fp8_communication
 
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
@@ -203,12 +201,10 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
         bias = self.bias if not self.skip_bias_add else None
 
         if self.seq_parallel_mode is None:
-            output_parallel = linear_with_async_comm(
-                input_parallel, self.weight, bias, self.process_group, True, fp8_communication=self.fp8_communication
-            )
+            output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, True)
         elif self.seq_parallel_mode == "split_gather":
             input_parallel = gather_forward_reducescatter_backward(
-                input_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
+                input_parallel, self.process_group, self.seq_parallel_dim
             )
             output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, False)
         elif self.seq_parallel_mode == "ring":
@@ -268,7 +264,6 @@ def __init__(
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
         stream_chunk_num: int = 1,
-        fp8_communication: bool = False,
     ):
         super().__init__()
 
@@ -283,7 +278,6 @@ def __init__(
         self.seq_parallel_mode = seq_parallel_mode
         self.seq_parallel_dim = seq_parallel_dim
         self.num_partitions = dist.get_world_size(self.process_group)
-        self.fp8_communication = fp8_communication
 
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
@@ -404,9 +398,7 @@ def forward(self, input_: Tensor) -> Tensor:
             ), "Invalid shapes in Linear1D_Row forward: input={}, weight={}. Expected last dim of input {}.".format(
                 input_.shape, self.weight.shape, self.weight.shape[-1] * self.num_partitions
             )
-            input_ = split_forward_gather_backward(
-                input_, dim=-1, process_group=self.process_group, fp8_comm=self.fp8_communication
-            )
+            input_ = split_forward_gather_backward(input_, dim=-1, process_group=self.process_group)
 
         if self.stream_chunk_num > 1:
             if self.training:
@@ -426,11 +418,11 @@ def forward(self, input_: Tensor) -> Tensor:
         else:
             if self.seq_parallel_mode is None:
                 output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
-                output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
+                output = reduce_forward(output_parallel, self.process_group)
             elif self.seq_parallel_mode == "split_gather":
                 output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
                 output = reducescatter_forward_gather_backward(
-                    output_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
+                    output_parallel, self.process_group, self.seq_parallel_dim
                 )
             elif self.seq_parallel_mode == "ring":
                 output = linear_reducescatter_forward_gather_backward(
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 26d1b6e4c0d2..bf5ce45a8342 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -460,7 +460,7 @@ def llama_for_sequence_classification_forward(
             return {"hidden_states": hidden_states}
 
 
-def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None):
+def get_llama_flash_attention_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -592,7 +592,7 @@ def forward(
     return forward
 
 
-def get_llama_flash_attention_model_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None):
+def get_llama_flash_attention_model_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
     logger = logging.get_logger(__name__)
 
     def forward(
@@ -659,18 +659,9 @@ def forward(
             attention_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
 
         if sp_mode in ["ring", "split_gather"]:
-            inputs_embeds = split_forward_gather_backward(
-                inputs_embeds,
-                1,
-                sp_group,
-            )
+            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group)
         elif sp_mode == "all_to_all":
-            inputs_embeds = split_forward_gather_backward(
-                inputs_embeds,
-                1,
-                sp_group,
-                1 / sp_size,
-            )
+            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group, 1 / sp_size)
         hidden_states = inputs_embeds
 
         # decoder layers
@@ -715,18 +706,9 @@ def forward(
         hidden_states = self.norm(hidden_states)
 
         if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(
-                hidden_states,
-                1,
-                sp_group,
-            )
+            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
         elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(
-                hidden_states,
-                1,
-                sp_group,
-                grad_scale=sp_size,
-            )
+            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
diff --git a/examples/language/gpt/hybridparallelism/finetune.py b/examples/language/gpt/hybridparallelism/finetune.py
index a4b48fa05eb3..9b3a101609dc 100644
--- a/examples/language/gpt/hybridparallelism/finetune.py
+++ b/examples/language/gpt/hybridparallelism/finetune.py
@@ -218,11 +218,8 @@ def main():
     elif args.plugin == "hybrid_parallel":
         # modify the param accordingly for finetuning test cases
         plugin = HybridParallelPlugin(
-            tp_size=2,
-            pp_size=1,
-            sp_size=1,
-            # sequence_parallelism_mode="split_gather",
-            # enable_sequence_parallelism=True,
+            tp_size=1,
+            pp_size=2,
             num_microbatches=None,
             microbatch_size=1,
             enable_all_optimization=True,

From 5b969fd831050a79edf97efea16653021edd90e8 Mon Sep 17 00:00:00 2001
From: GuangyaoZhang <xjtu521@qq.com>
Date: Thu, 18 Jul 2024 07:16:36 +0000
Subject: [PATCH 011/167] fix shardformer fp8 communication training
 degradation

---
 colossalai/shardformer/layer/_operation.py | 34 ++++++++++++++--------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index 604a154d0c0b..bbf1c862d820 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -95,7 +95,7 @@ def backward(ctx, grad_output):
             total_input = total_input.view(-1, total_input.shape[-1])
 
         if ctx.async_grad_allreduce and fp8_communication:
-            _reduce(grad_input, group=ctx.process_group, fp8_communication=fp8_communication)
+            _reduce(grad_input, group=ctx.process_group, fp8_communication=fp8_communication, fp8_format="e5m2")
         elif ctx.async_grad_allreduce:
             # Asynchronous all-reduce
             handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
@@ -566,7 +566,7 @@ def forward(ctx, input_, process_group, dim, fp8_communication=False):
         input_list = [item.contiguous() for item in torch.chunk(input_, dist.get_world_size(process_group), dim=dim)]
         output = torch.empty(new_shape, dtype=input_.dtype, device=input_.device)
         if fp8_communication:
-            reduce_scatter_fp8(output, input_list, group=process_group)
+            reduce_scatter_fp8(output, input_list, group=process_group, fp8_format="e4m3")
         else:
             dist.reduce_scatter(output, input_list, group=process_group)
 
@@ -577,7 +577,12 @@ def backward(ctx, grad_output):
         dim = ctx.dim
         process_group = ctx.process_group
         fp8_communication = ctx.fp8_communication
-        return _gather(grad_output, dim, process_group, fp8_communication=fp8_communication), None, None, None
+        return (
+            _gather(grad_output, dim, process_group, fp8_communication=fp8_communication, fp8_format="e5m2"),
+            None,
+            None,
+            None,
+        )
 
 
 class _MatmulWithGatherForwardReduceScatterBackward(torch.autograd.Function):
@@ -618,7 +623,7 @@ def forward(
             )
 
         else:
-            input_parallel = _gather(input_, dim, process_group, fp8_communication)
+            input_parallel = _gather(input_, dim, process_group, fp8_communication, fp8_format="e4m3")
 
             output = torch.matmul(input_parallel, weight)
 
@@ -641,7 +646,7 @@ def backward(ctx, grad_output):
             bias = bias.view(bias.shape)
 
         if not overlap:
-            input_parallel = _gather(input_, dim, process_group, fp8_communication)
+            input_parallel = _gather(input_, dim, process_group, fp8_communication, fp8_format="e5m2")
 
             total_input = input_parallel
             grad_input = grad_output.matmul(weight.T)
@@ -728,8 +733,13 @@ def backward(ctx, grad_output):
         if ctx.grad_scale is not None:
             grad_output = grad_output * ctx.grad_scale
 
-        # to_cast.append(grad_output.cpu().detach().numpy())
-        return _gather(grad_output, ctx.dim, ctx.process_group, ctx.fp8_communication), None, None, None, None
+        return (
+            _gather(grad_output, ctx.dim, ctx.process_group, ctx.fp8_communication, fp8_format="e5m2"),
+            None,
+            None,
+            None,
+            None,
+        )
 
 
 class _ReduceForward(torch.autograd.Function):
@@ -743,7 +753,7 @@ class _ReduceForward(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx, input_, process_group, fp8_communication=False):
-        return _reduce(input_, process_group, fp8_communication)
+        return _reduce(input_, process_group, fp8_communication, fp8_format="e4m3")
 
     @staticmethod
     def backward(ctx, grad_output):
@@ -768,7 +778,7 @@ def forward(ctx, input_, process_group, fp8_communication=False):
     @staticmethod
     def backward(ctx, grad_output):
         fp8_communication = ctx.fp8_communication
-        return _reduce(grad_output, ctx.process_group, fp8_communication), None, None
+        return _reduce(grad_output, ctx.process_group, fp8_communication, fp8_format="e5m2"), None, None
 
 
 class _GatherForwardSplitBackward(torch.autograd.Function):
@@ -786,7 +796,7 @@ def forward(ctx, input_, dim, process_group, grad_scale=None, fp8_communication=
         ctx.dim = dim
         ctx.grad_scale = grad_scale
 
-        return _gather(input_, dim, process_group, fp8_communication=fp8_communication)
+        return _gather(input_, dim, process_group, fp8_communication=fp8_communication, fp8_format="e4m3")
 
     @staticmethod
     def backward(ctx, grad_output):
@@ -851,13 +861,13 @@ def hook_parameter_in_backward(input, weight=None, bias=None):
     return HookParameter.apply(input, weight, bias)
 
 
-def _reduce(input_, process_group, fp8_communication=False):
+def _reduce(input_, process_group, fp8_communication=False, fp8_format="e5m2"):
     # skip if only one rank involved
     if dist.get_world_size(process_group) == 1:
         return input_
     else:
         if fp8_communication:
-            all_reduce_fp8(input_, group=process_group)
+            all_reduce_fp8(input_, group=process_group, fp8_format=fp8_format)
         else:
             dist.all_reduce(input_, group=process_group)
         return input_

From 5fd0592767c1dcdf88f89d2c37cf399acb52c2b9 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 24 Jul 2024 16:55:20 +0800
Subject: [PATCH 012/167] [fp8] support all-gather flat tensor (#5932)

---
 colossalai/quantization/fp8.py       | 76 ++++++++++++++++++++++++++++
 tests/test_fp8/test_fp8_allgather.py | 40 +++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 tests/test_fp8/test_fp8_allgather.py

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 867de839e19d..fe5bd5744e69 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -1,5 +1,6 @@
 from typing import Any
 
+import numpy as np
 import torch
 import torch.distributed as dist
 
@@ -202,3 +203,78 @@ def reduce_scatter_fp8(output: torch.Tensor, input_list, group, fp8_format="e5m2
         out = out.view(fp8_type)
         summed_out += cast_from_fp8(out, scale, input_type)
     output.data = summed_out
+
+
+def split_chunk_by_channel(
+    chunk: torch.Tensor, channel_size: int, num_channels: int, rank: int = 0, world_size: int = 1
+):
+    offset = chunk.numel() * rank
+    end = offset + chunk.numel()
+    break_points = [x for x in range(0, channel_size * num_channels + 1, channel_size) if offset <= x <= end]
+    if len(break_points) == 0 or break_points[0] > offset:
+        break_points.insert(0, offset)
+    if break_points[-1] < end:
+        break_points.append(end)
+    sizes = [b - a for a, b in zip(break_points[:-1], break_points[1:])]
+    return chunk.split(sizes)
+
+
+def all_gather_into_tensor_flat_fp8(
+    output_tensor: torch.Tensor,
+    input_tensor: torch.Tensor,
+    output_shape: torch.Size,
+    group: dist.ProcessGroup,
+    fp8_format: str = "e4m3",
+):
+    """all gather into tensor in fp8 format
+
+    Args:
+        output_tensor (torch.Tensor): output tensor, which is flattened
+        input_tensor (torch.Tensor): input tensor, which is flattened
+        group (dist.ProcessGroup): process group
+        fp8_format (str, optional): fp8 format, e4m3 or e5m2. Defaults to "e4m3".
+    """
+    assert input_tensor.dim() == 1 and output_tensor.dim() == 1, "input/output tensor should be flattened"
+    world_size = dist.get_world_size(group)
+    assert (
+        output_tensor.numel() == input_tensor.numel() * world_size
+    ), "output tensor size should be world_size times of input tensor size"
+
+    input_type = output_tensor.dtype
+
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+    fp8_max = torch.finfo(fp8_type).max
+
+    if len(output_shape) == 2:
+        per_channel_max = torch.zeros(output_shape[0], device=output_tensor.device, dtype=torch.float)
+        num_channels, channel_size = output_shape
+        rank = dist.get_rank(group)
+        channel_start_idx = (input_tensor.numel() * rank) // channel_size
+        per_channel_splits = split_chunk_by_channel(input_tensor, channel_size, num_channels, rank, world_size)
+        for i, per_channel_split in enumerate(per_channel_splits):
+            idx = i + channel_start_idx
+            if idx < num_channels:
+                per_channel_max[idx] = per_channel_split.abs().max().float()
+        dist.all_reduce(per_channel_max, op=dist.ReduceOp.MAX, group=group)
+        per_channel_max = torch.where(per_channel_max > 0, per_channel_max, 1.0)
+        scale = fp8_max / per_channel_max
+        fp8_input = input_tensor.float()
+        fp8_per_channel_splits = split_chunk_by_channel(fp8_input, channel_size, num_channels, rank, world_size)
+        for i, per_channel_split in enumerate(fp8_per_channel_splits):
+            idx = i + channel_start_idx
+            if idx < num_channels:
+                per_channel_split.mul_(scale[idx])
+        fp8_input = fp8_input.to(fp8_type)
+    else:
+        per_tensor_max = input_tensor.abs().max().float()
+        dist.all_reduce(per_tensor_max, op=dist.ReduceOp.MAX, group=group)
+        per_tensor_max = torch.where(per_tensor_max > 0, per_tensor_max, 1.0)
+        scale = fp8_max / per_tensor_max
+        fp8_input = (scale * input_tensor.float()).to(fp8_type)
+    scale_inv = 1.0 / scale
+    buffer = torch.empty_like(output_tensor, dtype=fp8_type)
+    dist.all_gather_into_tensor(buffer.view(torch.uint8), fp8_input.view(torch.uint8), group=group)
+    numel = np.prod(output_shape)
+    valid_buffer = buffer[:numel].reshape(output_shape)
+    valid_buffer = cast_from_fp8(valid_buffer, scale_inv, input_type)
+    output_tensor[:numel].copy_(valid_buffer.view(-1))
diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather.py
new file mode 100644
index 000000000000..1a4c8511a843
--- /dev/null
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -0,0 +1,40 @@
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.testing import assert_close
+
+from colossalai import launch
+from colossalai.accelerator import get_accelerator
+from colossalai.quantization.fp8 import all_gather_into_tensor_flat_fp8
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+
+
+@parameterize("shape", [(3, 7), (2, 1), (1, 2), (2, 2), (4, 2), (5,), (4,), (2,)])
+@parameterize("dtype", [torch.bfloat16, torch.float16])
+def check_4gpu(shape, dtype):
+    world_size = dist.get_world_size()
+    rank = dist.get_rank()
+    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
+    flat_padded_x = x.view(-1)
+    if flat_padded_x.size(0) % world_size != 0:
+        pad_size = world_size - flat_padded_x.size(0) % world_size
+        flat_padded_x = F.pad(flat_padded_x, (0, pad_size))
+    output = torch.empty_like(flat_padded_x)
+    chunk = flat_padded_x.chunk(world_size)[rank].clone()
+    all_gather_into_tensor_flat_fp8(output, chunk, x.shape, group=_get_default_group())
+    assert_close(output[: x.numel()], x.view(-1), rtol=0.1, atol=0.1)
+
+
+def run_dist(rank, world_size, port):
+    launch(rank=rank, world_size=world_size, port=port, host="localhost")
+    check_4gpu()
+
+
+@rerun_if_address_is_in_use()
+def test_all_gather():
+    spawn(run_dist, 4)
+
+
+if __name__ == "__main__":
+    test_all_gather()

From ae486ce0053ca71750229cd55b14d92f0f0484e2 Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 2 Aug 2024 11:12:12 +0800
Subject: [PATCH 013/167] [fp8] add fp8 comm for low level zero

---
 .../booster/plugin/low_level_zero_plugin.py   |  2 ++
 .../low_level/bookkeeping/tensor_bucket.py    | 13 +++++++---
 colossalai/zero/low_level/low_level_optim.py  | 26 +++++++++++++++----
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 7b5aec2aa405..bfb8930bb99c 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -293,6 +293,7 @@ def __init__(
         cpu_offload: bool = False,
         master_weights: bool = True,
         verbose: bool = False,
+        fp8_communication: bool = False,
     ) -> None:
         super().__init__()
         assert stage in (1, 2), f"LowLevelZeroPlugin only supports stage 1/2 training"
@@ -315,6 +316,7 @@ def __init__(
             partition_grad=(stage == 2),
             cpu_offload=cpu_offload,
             master_weights=master_weights,
+            fp8_communication=fp8_communication,
         )
         self.lora_enabled = False
         self.verbose = verbose
diff --git a/colossalai/zero/low_level/bookkeeping/tensor_bucket.py b/colossalai/zero/low_level/bookkeeping/tensor_bucket.py
index 5b09019b9169..d5fd2fe51662 100644
--- a/colossalai/zero/low_level/bookkeeping/tensor_bucket.py
+++ b/colossalai/zero/low_level/bookkeeping/tensor_bucket.py
@@ -4,6 +4,8 @@
 import torch.distributed as dist
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 
+from colossalai.quantization.fp8 import all_gather_into_tensor_flat_fp8
+
 
 class TensorBucket:
     def __init__(self, size):
@@ -61,11 +63,14 @@ def unflatten_and_copy(self, flat_tensor):
         for old, new in zip(self._bucket, unflattened_tensor_list):
             old.copy_(new)
 
-    def all_gather(self, group=None):
+    def all_gather(self, group=None, fp8_communication: bool = False):
         flat = self.flatten()
-        buffers = [torch.empty_like(flat) for _ in range(dist.get_world_size(group))]
-        dist.all_gather(buffers, flat, group=group)
-        unflat_buffers = [self.unflatten(buffer) for buffer in buffers]
+        buffer = torch.empty(flat.numel() * dist.get_world_size(group), device=flat.device, dtype=flat.dtype)
+        if fp8_communication:
+            all_gather_into_tensor_flat_fp8(buffer, flat, output_shape=buffer.shape, group=group)
+        else:
+            dist.all_gather_into_tensor(buffer, flat, group=group)
+        unflat_buffers = [self.unflatten(buffer) for buffer in buffer.chunk(dist.get_world_size(group))]
         # transpose the list of list
         unflat_buffers = list(map(list, zip(*unflat_buffers)))
         for unflat_shards, tensor in zip(unflat_buffers, self._bucket):
diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py
index bdc91b51fa4a..1353071c59df 100644
--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -20,6 +20,7 @@
 )
 from colossalai.interface import OptimizerWrapper
 from colossalai.logging import get_dist_logger
+from colossalai.quantization.fp8 import all_gather_into_tensor_flat_fp8, all_reduce_fp8, reduce_scatter_fp8
 
 from ._utils import calculate_global_norm_from_list, has_inf_or_nan, release_param_grad, sync_tensor
 from .bookkeeping import BucketStore, GradientStore, TensorBucket
@@ -83,6 +84,7 @@ def __init__(
         dp_process_group: Optional[ProcessGroup] = None,
         forced_dtype: Optional[torch.dtype] = None,
         master_weights: bool = True,  # master weights
+        fp8_communication: bool = False,
     ):
         super(LowLevelZeroOptimizer, self).__init__(optim=optimizer)
 
@@ -123,6 +125,7 @@ def __init__(
         self._overlap_communication = overlap_communication
         self._reduce_bucket_size = reduce_bucket_size
         self._communication_dtype = communication_dtype
+        self._fp8_communication = fp8_communication
 
         # gradient clipping
         self._clip_grad_norm = clip_grad_norm
@@ -323,7 +326,10 @@ def _run_reduction(self):
                     flat_grads = flat_grads.to(self._communication_dtype)
 
                 if not self._partition_grads:
-                    dist.all_reduce(flat_grads, group=bucket_store.torch_pg)
+                    if self._fp8_communication:
+                        all_reduce_fp8(flat_grads, group=bucket_store.torch_pg)
+                    else:
+                        dist.all_reduce(flat_grads, group=bucket_store.torch_pg)
                     if flat_grads.dtype != grad_dtype:
                         flat_grads = flat_grads.to(grad_dtype)
 
@@ -333,7 +339,14 @@ def _run_reduction(self):
                 else:
                     flat_grads_list = list(flat_grads.split(len(flat_grads) // bucket_store.world_size))
                     recieved_grad = torch.zeros_like(flat_grads_list[0])
-                    dist.reduce_scatter(recieved_grad, flat_grads_list, group=bucket_store.torch_pg)
+                    if self._fp8_communication:
+                        reduce_scatter_fp8(
+                            recieved_grad,
+                            flat_grads_list,
+                            group=bucket_store.torch_pg,
+                        )
+                    else:
+                        dist.reduce_scatter(recieved_grad, flat_grads_list, group=bucket_store.torch_pg)
 
                     if recieved_grad.dtype != grad_dtype:
                         recieved_grad = recieved_grad.to(grad_dtype)
@@ -553,18 +566,21 @@ def step(self, closure=None):
                     buffer_tensor = torch.empty_like(
                         torch.cat([param_to_gather for _ in range(dist.get_world_size(pg))])
                     )
-                    dist.all_gather_into_tensor(buffer_tensor, param_to_gather, pg)
+                    if self._fp8_communication:
+                        all_gather_into_tensor_flat_fp8(buffer_tensor, param_to_gather, pg, fp8_format="e4m3")
+                    else:
+                        dist.all_gather_into_tensor(buffer_tensor, param_to_gather, pg)
                     working_param.data.copy_(buffer_tensor[: working_param.numel()].reshape_as(working_param))
                     continue
                 try:
                     self.pg_to_tensor_bucket[pg].add_to_bucket(param_to_gather, write_back_tensor=working_param)
                 except RuntimeError:
-                    self.pg_to_tensor_bucket[pg].all_gather(pg)
+                    self.pg_to_tensor_bucket[pg].all_gather(pg, fp8_communication=self._fp8_communication)
                     self.pg_to_tensor_bucket[pg].add_to_bucket(param_to_gather, write_back_tensor=working_param)
             self.optim.param_groups[group_id]["params"] = self._master_param_groups_of_current_rank[group_id]
         for pg, tensor_bucket in self.pg_to_tensor_bucket.items():
             if not tensor_bucket.is_empty():
-                tensor_bucket.all_gather(pg)
+                tensor_bucket.all_gather(pg, fp8_communication=self._fp8_communication)
 
     def _compute_grad_norm(self, dp_pg: ProcessGroup, gradients: List[Tensor], norm_type: int = 2) -> float:
         r"""

From 91e596d0178ab043e32d96c3b23eb9e9efa0a69b Mon Sep 17 00:00:00 2001
From: ver217 <lhx0217@gmail.com>
Date: Fri, 2 Aug 2024 11:28:38 +0800
Subject: [PATCH 014/167] [test] add zero fp8 test case

---
 .../test_zero/test_low_level/test_zero1_2.py  | 23 +++++++++++++++----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/tests/test_zero/test_low_level/test_zero1_2.py b/tests/test_zero/test_low_level/test_zero1_2.py
index 8df35bdaa968..cda51c4ef435 100644
--- a/tests/test_zero/test_low_level/test_zero1_2.py
+++ b/tests/test_zero/test_low_level/test_zero1_2.py
@@ -51,7 +51,8 @@ def split_ddp_grad(grad, world_size):
     return splited_grad
 
 
-def exam_zero_1_2():
+@parameterize("fp8_communication", [True, False])
+def exam_zero_1_2(fp8_communication: bool):
     """
     In this test, we want to test whether zero stage 1 and 2
     deliver the same numerical results despite different communication
@@ -73,10 +74,18 @@ def exam_zero_1_2():
     zero1_optimizer = torch.optim.Adam(zero1_model.parameters(), lr=1)
     zero2_optimizer = torch.optim.Adam(zero2_model.parameters(), lr=1)
     zero1_optimizer = LowLevelZeroOptimizer(
-        zero1_optimizer, overlap_communication=True, initial_scale=128, verbose=True
+        zero1_optimizer,
+        overlap_communication=True,
+        initial_scale=128,
+        verbose=True,
+        fp8_communication=fp8_communication,
     )
     zero2_optimizer = LowLevelZeroOptimizer(
-        zero2_optimizer, overlap_communication=True, partition_grad=True, initial_scale=128
+        zero2_optimizer,
+        overlap_communication=True,
+        partition_grad=True,
+        initial_scale=128,
+        fp8_communication=fp8_communication,
     )
     # create data
     seed_all(2001 + local_rank)
@@ -97,7 +106,10 @@ def exam_zero_1_2():
         if g1 is None or g2 is None:
             assert g1 is None and g2 is None
             continue
-        assert torch.allclose(g1, g2)
+        if fp8_communication:
+            loose_close(g1, g2, dtype=torch.float16)
+        else:
+            assert torch.allclose(g1, g2)
 
     # step
     zero1_optimizer.step()
@@ -105,7 +117,8 @@ def exam_zero_1_2():
 
     # check updated param
     for z1p, z2p in zip(zero1_model.parameters(), zero2_model.parameters()):
-        assert torch.allclose(z1p, z2p)
+        if not fp8_communication:
+            assert torch.allclose(z1p, z2p)
 
 
 @parameterize("dtype", [torch.float16, torch.bfloat16])

From 53cb9606bd86ed394e3a3d18a82fae5428f09155 Mon Sep 17 00:00:00 2001
From: Guangyao Zhang <xjtu521@qq.com>
Date: Mon, 5 Aug 2024 10:05:47 +0800
Subject: [PATCH 015/167] [Feature] llama shardformer fp8 support (#5938)

* add llama shardformer fp8

* Llama Shardformer Parity

* fix typo

* fix all reduce

* fix pytest failure

* fix reduce op and move function to fp8.py

* fix typo
---
 colossalai/quantization/fp8.py                | 113 +++++++++++--
 colossalai/shardformer/layer/_operation.py    | 157 +++++++++++-------
 colossalai/shardformer/layer/linear.py        |  22 ++-
 colossalai/shardformer/modeling/llama.py      |  30 ++--
 colossalai/shardformer/policies/llama.py      |  15 +-
 tests/test_fp8/test_fp8_all_to_all.py         |  39 +++++
 tests/test_fp8/test_fp8_all_to_all_single.py  |  37 +++++
 ...llgather.py => test_fp8_allgather_flat.py} |   4 +-
 tests/test_fp8/test_fp8_allreduce.py          |  48 ++++++
 tests/test_fp8/test_fp8_gather.py             |  48 ++++++
 tests/test_fp8/test_fp8_reduce_scatter.py     |  38 +++++
 11 files changed, 453 insertions(+), 98 deletions(-)
 create mode 100644 tests/test_fp8/test_fp8_all_to_all.py
 create mode 100644 tests/test_fp8/test_fp8_all_to_all_single.py
 rename tests/test_fp8/{test_fp8_allgather.py => test_fp8_allgather_flat.py} (96%)
 create mode 100644 tests/test_fp8/test_fp8_allreduce.py
 create mode 100644 tests/test_fp8/test_fp8_gather.py
 create mode 100644 tests/test_fp8/test_fp8_reduce_scatter.py

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index fe5bd5744e69..b003e90e89f8 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -3,9 +3,11 @@
 import numpy as np
 import torch
 import torch.distributed as dist
+import torch.nn.functional as F
+from torch.distributed import ReduceOp
 
 
-def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3") -> (torch.Tensor, torch.Tensor):
+def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False) -> (torch.Tensor, torch.Tensor):
     r"""
     casting torch Tensor into specified fp8 tensor with per-channel scaling or per-tensor scaling.
     Args:
@@ -23,7 +25,7 @@ def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3") -> (torch.Tensor, torch.Te
     fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
     fp8_max = torch.finfo(fp8_type).max
 
-    if inp.dim() == 2:
+    if per_channel_scale:
         per_channel_max = inp.abs().max(dim=-1).values.float()
         per_channel_max = torch.where(per_channel_max > 0, per_channel_max, 1.0)
         scale = fp8_max / per_channel_max[:, None]
@@ -37,7 +39,9 @@ def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3") -> (torch.Tensor, torch.Te
     return ret, scale_inv
 
 
-def cast_from_fp8(inp: torch.Tensor, scale_inv: torch.Tensor, ret_type: torch.dtype) -> torch.Tensor:
+def cast_from_fp8(
+    inp: torch.Tensor, scale_inv: torch.Tensor, ret_type: torch.dtype, per_channel_scale=False
+) -> torch.Tensor:
     r"""
     Args:
         inp: should be a fp8 torch tensor in one of the types: [torch.float8_e4m3fn, torch.float8_e5m2].
@@ -49,20 +53,23 @@ def cast_from_fp8(inp: torch.Tensor, scale_inv: torch.Tensor, ret_type: torch.dt
     if inp.dtype not in [torch.float8_e4m3fn, torch.float8_e5m2]:
         raise TypeError("Only float8_e4m3fn and float8_e5m2 are allowed.")
 
-    if inp.dim() == 2:
+    if per_channel_scale:
         ret = scale_inv[:, None] * inp.float()
     else:
         ret = scale_inv * inp.float()
     return ret.to(ret_type)
 
 
-def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e5m2", group=None) -> None:
+def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3", op=ReduceOp.SUM, group=None) -> None:
     r"""
     This is an in-place operation for compressed all_reduce using fp8.
     It works like dist.all_reduce but during communication the data is cast to fp8 format.
+
     Args:
         tensor: torch.Tensor in fp32, fp16, bf16 datatype.
         fp8_format: e4m3 or e5m2
+        op: ReduceOp.SUM or ReduceOp.AVG
+
     Returns:
         None
     """
@@ -72,18 +79,20 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e5m2", group=None) -> None:
     input_shape = tensor.shape
     input_device = tensor.device
     input_size = tensor.numel()
-    tensor = tensor.flatten()
+    flat_padded_x = tensor.flatten()
 
-    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+    assert op in [ReduceOp.SUM, ReduceOp.AVG], "op can only be ReduceOp.SUM or ReduceOp.AVG"
 
-    ret, scale = cast_to_fp8(tensor, fp8_format=fp8_format)
+    if flat_padded_x.size(0) % world_size != 0:
+        pad_size = world_size - flat_padded_x.size(0) % world_size
+        flat_padded_x = F.pad(flat_padded_x, (0, pad_size))
+
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+    ret, scale = cast_to_fp8(flat_padded_x, fp8_format=fp8_format)
 
     inp = ret.view(torch.uint8)
     input_chunks = list(torch.chunk(inp, world_size, dim=0))
-    if dist.get_rank() == world_size - 1:
-        output_chunks = [torch.empty_like(input_chunks[-1]) for _ in range(world_size)]
-    else:
-        output_chunks = [torch.empty_like(input_chunks[0]) for _ in range(world_size)]
+    output_chunks = list(torch.chunk(torch.empty_like(inp), world_size, dim=0))
     dist.all_to_all(output_chunks, input_chunks, group=group)
     scale_list = [torch.ones(1, dtype=scale.dtype, device=input_device) for _ in range(world_size)]
     dist.all_gather(scale_list, scale, group=group)
@@ -92,15 +101,18 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e5m2", group=None) -> None:
         out = out.view(fp8_type)
         summed_out += cast_from_fp8(out, scale, input_type)
 
+    if op == ReduceOp.AVG:
+        summed_out.div_(world_size)
+
     summed_out_fp8, scale = cast_to_fp8(summed_out, fp8_format=fp8_format)
     dist.all_gather(scale_list, scale, group=group)
 
-    tensor_list = list(torch.chunk(torch.empty(input_size, device=input_device, dtype=torch.uint8), world_size, dim=0))
+    tensor_list = [torch.empty_like(summed_out_fp8.view(torch.uint8)) for _ in range(world_size)]
     dist.all_gather(tensor_list, summed_out_fp8.view(torch.uint8), group=group)
     for i in range(world_size):
         tensor_list[i] = tensor_list[i].view(fp8_type).to(input_type) * scale_list[i]
-    tensor_out = torch.cat(tensor_list, dim=0)
-    tensor.data = tensor_out.view(input_shape).to(input_type)
+    out = torch.cat(tensor_list, dim=0)
+    tensor.copy_(out[:input_size].view(input_shape).to(input_type))
 
 
 def cast_to_fp8_pipeline(inp: Any) -> None:
@@ -276,5 +288,74 @@ def all_gather_into_tensor_flat_fp8(
     dist.all_gather_into_tensor(buffer.view(torch.uint8), fp8_input.view(torch.uint8), group=group)
     numel = np.prod(output_shape)
     valid_buffer = buffer[:numel].reshape(output_shape)
-    valid_buffer = cast_from_fp8(valid_buffer, scale_inv, input_type)
+    valid_buffer = cast_from_fp8(valid_buffer, scale_inv, input_type, per_channel_scale=(len(output_shape) == 2))
     output_tensor[:numel].copy_(valid_buffer.view(-1))
+
+
+def all_to_all_fp8(output_list, input_list, group=None, fp8_format="e5m2"):
+
+    world_size = dist.get_world_size(group)
+
+    input_type = input_list[0].dtype
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+    scale_list = []
+    tensor_list = []
+
+    for i in range(world_size):
+        input_tensor = input_list[i]
+        ret, scale = cast_to_fp8(input_tensor, fp8_format=fp8_format)
+        scale_list.append(scale)
+        ret = ret.view(torch.uint8)
+        tensor_list.append(ret)
+
+    output_scale_list = [torch.empty_like(x) for x in scale_list]
+    output_tensor_list = [torch.empty_like(x) for x in tensor_list]
+    dist.all_to_all(output_tensor_list, tensor_list, group=group)
+    dist.all_to_all(output_scale_list, scale_list, group=group)
+
+    for i in range(world_size):
+        scale = output_scale_list[i]
+        tensor = output_tensor_list[i]
+        tensor = tensor.view(fp8_type)
+        output_list[i].copy_(cast_from_fp8(tensor, scale, input_type))
+
+
+def all_to_all_single_fp8(output_tensor, input_tensor, group=None, fp8_format="e5m2"):
+
+    world_size = dist.get_world_size(group)
+
+    per_slice_len = input_tensor.size(0) // world_size
+    input_type = input_tensor.dtype
+    ret, scale = cast_to_fp8(input_tensor, fp8_format=fp8_format)
+    fp8_type = ret.dtype
+    input_tensor = ret.view(torch.uint8)
+    tensor = torch.empty_like(input_tensor)
+    scale_list = [torch.empty_like(scale) for _ in range(world_size)]
+    dist.all_to_all_single(tensor, input_tensor, group=group)
+    dist.all_gather(scale_list, scale, group=group)
+    cast_tensor_list = []
+
+    for i in range(world_size):
+        output_part = tensor[per_slice_len * i : per_slice_len * (i + 1)].view(fp8_type)
+        output_part = cast_from_fp8(output_part, scale_list[i], input_type)
+        cast_tensor_list.append(output_part)
+    output_tensor.copy_(torch.concatenate(cast_tensor_list, dim=0))
+
+
+def gather_fp8(output_list, input_, group=None, fp8_format="e5m2"):
+
+    world_size = dist.get_world_size(group)
+
+    input_type = input_.dtype
+    ret, scale = cast_to_fp8(input_, fp8_format=fp8_format)
+    fp8_type = ret.dtype
+    input_ = ret.view(torch.uint8)
+    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
+    scale_list = [torch.ones(1, dtype=scale.dtype, device=input_.device) for _ in range(world_size)]
+    dist.all_gather(tensor_list, input_, group=group)
+    dist.all_gather(scale_list, scale, group=group)
+
+    for i in range(world_size):
+        output = tensor_list[i].view(fp8_type)
+        scale = scale_list[i]
+        output_list[i].copy_(cast_from_fp8(output, scale, input_type))
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index bbf1c862d820..a27fd35c192d 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -14,7 +14,13 @@
 except ImportError:
     _grad_accum_fusion_available = False
 
-from colossalai.quantization.fp8 import all_reduce_fp8, cast_from_fp8, cast_to_fp8, reduce_scatter_fp8
+from colossalai.quantization.fp8 import (
+    all_reduce_fp8,
+    all_to_all_fp8,
+    all_to_all_single_fp8,
+    gather_fp8,
+    reduce_scatter_fp8,
+)
 
 
 class FusedLayerNormAffineFunction1D(torch.autograd.Function):
@@ -117,11 +123,12 @@ class LinearWithAsyncCommunication(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce):
+    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False):
         ctx.save_for_backward(input_, weight, bias)
         ctx.use_bias = bias is not None
         ctx.process_group = process_group
         ctx.async_grad_allreduce = async_grad_allreduce
+        ctx.fp8_communication = fp8_communication
         if bias is not None:
             output = F.linear(input_, weight, bias)
         else:
@@ -133,6 +140,7 @@ def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce):
     def backward(ctx, grad_output):
         input, weight, bias = ctx.saved_tensors
         use_bias = ctx.use_bias
+        fp8_communication = ctx.fp8_communication
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to bias.
         if use_bias:
@@ -148,7 +156,10 @@ def backward(ctx, grad_output):
 
         if ctx.async_grad_allreduce:
             # Asynchronous all-reduce
-            handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
+            if fp8_communication:
+                all_reduce_fp8(grad_input, group=ctx.process_group)
+            else:
+                handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
             # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
             # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py
 
@@ -167,10 +178,10 @@ def backward(ctx, grad_output):
 
         grad_bias = grad_output.sum(dim=0) if use_bias else None
 
-        if ctx.async_grad_allreduce:
+        if ctx.async_grad_allreduce and not fp8_communication:
             handle.wait()
 
-        return grad_input, grad_weight, grad_bias, None, None, None
+        return grad_input, grad_weight, grad_bias, None, None, None, None
 
 
 def _ring_as_gather(func, input_to_gather=None, input_local=None, process_group=None, gather_dim=1, keep_item=False):
@@ -238,16 +249,18 @@ class _GatherForwardReduceScatterBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group, dim):
+    def forward(ctx, input_, process_group, dim, fp8_communication=False):
         ctx.process_group = process_group
         ctx.dim = dim
+        ctx.fp8_communication = fp8_communication
 
-        return _gather(input_, dim, process_group)
+        return _gather(input_, dim, process_group, fp8_communication, fp8_format="e4m3")
 
     @staticmethod
     def backward(ctx, grad_output):
         dim = ctx.dim
         process_group = ctx.process_group
+        fp8_communication = ctx.fp8_communication
         # do reduce-scatter
         new_shape = list(grad_output.shape)
         assert (
@@ -259,9 +272,12 @@ def backward(ctx, grad_output):
         ]
         output = torch.empty(new_shape, dtype=grad_output.dtype, device=grad_output.device)
 
-        dist.reduce_scatter(output, grad_list, group=process_group)
+        if fp8_communication:
+            reduce_scatter_fp8(output, grad_list, group=process_group, fp8_format="e5m2")
+        else:
+            dist.reduce_scatter(output, grad_list, group=process_group)
 
-        return output, None, None
+        return output, None, None, None
 
 
 class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function):
@@ -577,12 +593,8 @@ def backward(ctx, grad_output):
         dim = ctx.dim
         process_group = ctx.process_group
         fp8_communication = ctx.fp8_communication
-        return (
-            _gather(grad_output, dim, process_group, fp8_communication=fp8_communication, fp8_format="e5m2"),
-            None,
-            None,
-            None,
-        )
+
+        return _gather(grad_output, dim, process_group, fp8_communication, fp8_format="e5m2"), None, None, None
 
 
 class _MatmulWithGatherForwardReduceScatterBackward(torch.autograd.Function):
@@ -816,26 +828,67 @@ class _AllToAll(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group, scatter_dim, gather_dim):
+    def forward(ctx, input_, process_group, scatter_dim, gather_dim, fp8_communication=False):
         ctx.process_group = process_group
         ctx.scatter_dim = scatter_dim
         ctx.gather_dim = gather_dim
+        ctx.fp8_communication = fp8_communication
         world_size = dist.get_world_size(process_group)
         bsz, _, _ = input_.shape
 
         # using all_to_all_single when batch size is 1
         if bsz == 1:
-            return _all_to_all_single(input_, world_size, process_group, scatter_dim, gather_dim)
+            return _all_to_all_single(
+                input_,
+                world_size,
+                process_group,
+                scatter_dim,
+                gather_dim,
+                fp8_communication=fp8_communication,
+                fp8_format="e4m3",
+            )
         else:
-            return _all_to_all(input_, world_size, process_group, scatter_dim, gather_dim)
+            return _all_to_all(
+                input_,
+                world_size,
+                process_group,
+                scatter_dim,
+                gather_dim,
+                fp8_communication=fp8_communication,
+                fp8_format="e4m3",
+            )
 
     @staticmethod
-    def backward(ctx, *grad_output):
+    def backward(ctx, grad_output):
         process_group = ctx.process_group
         scatter_dim = ctx.gather_dim
         gather_dim = ctx.scatter_dim
-        return_grad = _AllToAll.apply(*grad_output, process_group, scatter_dim, gather_dim)
-        return (return_grad, None, None, None)
+        fp8_communication = ctx.fp8_communication
+        world_size = dist.get_world_size(process_group)
+        bsz, _, _ = grad_output.shape
+
+        if bsz == 1:
+            return_grad = _all_to_all_single(
+                grad_output,
+                world_size,
+                process_group,
+                scatter_dim,
+                gather_dim,
+                fp8_communication=fp8_communication,
+                fp8_format="e5m2",
+            )
+        else:
+            return_grad = _all_to_all(
+                grad_output,
+                world_size,
+                process_group,
+                scatter_dim,
+                gather_dim,
+                fp8_communication=fp8_communication,
+                fp8_format="e5m2",
+            )
+
+        return (return_grad, None, None, None, None)
 
 
 class HookParameter(torch.autograd.Function):
@@ -899,33 +952,14 @@ def _gather(input_, dim=-1, process_group=None, fp8_communication=False, fp8_for
     if world_size == 1:
         return input_
 
+    input_ = input_.contiguous()
+    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
     if fp8_communication:
-        input_type = input_.dtype
-        ret, scale = cast_to_fp8(input_, fp8_format=fp8_format)
-        fp8_type = ret.dtype
-        input_ = ret.view(torch.uint8)
-        input_ = input_.contiguous()
-        tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
-        scale = torch.tensor(scale, dtype=torch.float32).to(input_.device)
-        scale_list = [torch.ones(1, dtype=torch.float32, device=input_.device) for _ in range(world_size)]
-
-        scale = torch.tensor(scale).to(input_.device)
-        torch.distributed.all_gather(tensor_list, input_, group=process_group)
-        torch.distributed.all_gather(scale_list, scale, group=process_group)
-
-        cast_tensor_list = []
-        for output, scale in zip(tensor_list, scale_list):
-            output = output.view(fp8_type)
-            output = cast_from_fp8(output, scale, input_type)
-            cast_tensor_list.append(output)
-
-        output = torch.cat(cast_tensor_list, dim=dim).contiguous()
-
+        gather_fp8(tensor_list, input_, fp8_format=fp8_format, group=process_group)
     else:
-        input_ = input_.contiguous()
-        tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
-        torch.distributed.all_gather(tensor_list, input_, group=process_group)
-        output = torch.cat(tensor_list, dim=dim).contiguous()
+        dist.all_gather(tensor_list, input_, group=process_group)
+
+    output = torch.cat(tensor_list, dim=dim).contiguous()
 
     return output
 
@@ -954,14 +988,19 @@ def _reduce_scatter(input_, dim=1, process_group=None):
     return output
 
 
-def _all_to_all(input_, world_size, group, scatter_dim, gather_dim):
+def _all_to_all(input_, world_size, group, scatter_dim, gather_dim, fp8_communication=False, fp8_format="e5m2"):
     input_list = [t.contiguous() for t in torch.tensor_split(input_, world_size, scatter_dim)]
     output_list = [torch.empty_like(input_list[0]) for _ in range(world_size)]
-    dist.all_to_all(output_list, input_list, group=group)
+    if fp8_communication:
+        all_to_all_fp8(output_list, input_list, group=group, fp8_format=fp8_format)
+    else:
+        dist.all_to_all(output_list, input_list, group=group)
     return torch.cat(output_list, dim=gather_dim).contiguous()
 
 
-def _all_to_all_single(input_, seq_world_size, group, scatter_dim, gather_dim):
+def _all_to_all_single(
+    input_, seq_world_size, group, scatter_dim, gather_dim, fp8_communication=False, fp8_format="e5m2"
+):
     inp_shape = list(input_.shape)
     inp_shape[scatter_dim] = inp_shape[scatter_dim] // seq_world_size
     if scatter_dim < 2:
@@ -974,7 +1013,11 @@ def _all_to_all_single(input_, seq_world_size, group, scatter_dim, gather_dim):
         )
 
     output = torch.empty_like(input_t)
-    dist.all_to_all_single(output, input_t, group=group)
+    if fp8_communication:
+        all_to_all_single_fp8(output, input_t, group=group, fp8_format=fp8_format)
+    else:
+
+        dist.all_to_all_single(output, input_t, group=group)
 
     if scatter_dim < 2:
         output = output.transpose(0, 1).contiguous()
@@ -994,8 +1037,10 @@ def matmul_with_async_comm(input_, weight, bias, process_group, async_grad_allre
     )
 
 
-def linear_with_async_comm(input_, weight, bias, process_group, async_grad_allreduce):
-    return LinearWithAsyncCommunication.apply(input_, weight, bias, process_group, async_grad_allreduce)
+def linear_with_async_comm(input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False):
+    return LinearWithAsyncCommunication.apply(
+        input_, weight, bias, process_group, async_grad_allreduce, fp8_communication
+    )
 
 
 def linear_gather_forward_reducescatter_backward(
@@ -1006,8 +1051,8 @@ def linear_gather_forward_reducescatter_backward(
     )
 
 
-def gather_forward_reducescatter_backward(input_, process_group, dim):
-    return _GatherForwardReduceScatterBackward.apply(input_, process_group, dim)
+def gather_forward_reducescatter_backward(input_, process_group, dim, fp8_communication=False):
+    return _GatherForwardReduceScatterBackward.apply(input_, process_group, dim, fp8_communication)
 
 
 def reducescatter_forward_gather_backward(input_, process_group, dim, fp8_communication=False):
@@ -1042,5 +1087,5 @@ def reduce_backward(input_, process_group, fp8_communication=False):
     return _ReduceBackward.apply(input_, process_group, fp8_communication)
 
 
-def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1):
-    return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim)
+def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1, fp8_communication=False):
+    return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim, fp8_communication)
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index 37c7542416f6..38a6ef1a19ae 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -84,6 +84,7 @@ def __init__(
         bias_: Optional[Parameter] = None,
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
+        fp8_communication: bool = False,
         **kwargs,
     ):
         super().__init__(weight=weight, bias_=bias_, **kwargs)
@@ -98,6 +99,7 @@ def __init__(
         self.skip_bias_add = skip_bias_add
         self.device = device
         self.process_group = process_group
+        self.fp8_communication = fp8_communication
 
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
@@ -201,10 +203,12 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
         bias = self.bias if not self.skip_bias_add else None
 
         if self.seq_parallel_mode is None:
-            output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, True)
+            output_parallel = linear_with_async_comm(
+                input_parallel, self.weight, bias, self.process_group, True, fp8_communication=self.fp8_communication
+            )
         elif self.seq_parallel_mode == "split_gather":
             input_parallel = gather_forward_reducescatter_backward(
-                input_parallel, self.process_group, self.seq_parallel_dim
+                input_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
             )
             output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, False)
         elif self.seq_parallel_mode == "ring":
@@ -214,7 +218,9 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
 
         if self.gather_output:
             # All-gather across the partitions.
-            output = gather_forward_split_backward(output_parallel, dim=-1, process_group=self.process_group)
+            output = gather_forward_split_backward(
+                output_parallel, dim=-1, process_group=self.process_group, fp8_communication=self.fp8_communication
+            )
         else:
             output = output_parallel
 
@@ -264,6 +270,7 @@ def __init__(
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
         stream_chunk_num: int = 1,
+        fp8_communication: bool = False,
     ):
         super().__init__()
 
@@ -278,6 +285,7 @@ def __init__(
         self.seq_parallel_mode = seq_parallel_mode
         self.seq_parallel_dim = seq_parallel_dim
         self.num_partitions = dist.get_world_size(self.process_group)
+        self.fp8_communication = fp8_communication
 
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
@@ -398,7 +406,9 @@ def forward(self, input_: Tensor) -> Tensor:
             ), "Invalid shapes in Linear1D_Row forward: input={}, weight={}. Expected last dim of input {}.".format(
                 input_.shape, self.weight.shape, self.weight.shape[-1] * self.num_partitions
             )
-            input_ = split_forward_gather_backward(input_, dim=-1, process_group=self.process_group)
+            input_ = split_forward_gather_backward(
+                input_, dim=-1, process_group=self.process_group, fp8_communication=self.fp8_communication
+            )
 
         if self.stream_chunk_num > 1:
             if self.training:
@@ -418,11 +428,11 @@ def forward(self, input_: Tensor) -> Tensor:
         else:
             if self.seq_parallel_mode is None:
                 output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
-                output = reduce_forward(output_parallel, self.process_group)
+                output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
             elif self.seq_parallel_mode == "split_gather":
                 output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
                 output = reducescatter_forward_gather_backward(
-                    output_parallel, self.process_group, self.seq_parallel_dim
+                    output_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
                 )
             elif self.seq_parallel_mode == "ring":
                 output = linear_reducescatter_forward_gather_backward(
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index bf5ce45a8342..f83735f0520d 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -460,7 +460,7 @@ def llama_for_sequence_classification_forward(
             return {"hidden_states": hidden_states}
 
 
-def get_llama_flash_attention_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
+def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None):
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -510,9 +510,9 @@ def forward(
 
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
-            query_states = all_to_all_comm(query_states, sp_group)
-            key_states = all_to_all_comm(key_states, sp_group)
-            value_states = all_to_all_comm(value_states, sp_group)
+            query_states = all_to_all_comm(query_states, sp_group, fp8_communication=shard_config.fp8_communication)
+            key_states = all_to_all_comm(key_states, sp_group, fp8_communication=shard_config.fp8_communication)
+            value_states = all_to_all_comm(value_states, sp_group, fp8_communication=shard_config.fp8_communication)
             bsz, q_len, _ = query_states.size()
 
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
@@ -574,7 +574,9 @@ def forward(
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
             attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
-            attn_output = all_to_all_comm(attn_output, sp_group, scatter_dim=1, gather_dim=2)
+            attn_output = all_to_all_comm(
+                attn_output, sp_group, scatter_dim=1, gather_dim=2, fp8_communication=shard_config.fp8_communication
+            )
         else:
             attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
@@ -592,7 +594,7 @@ def forward(
     return forward
 
 
-def get_llama_flash_attention_model_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
+def get_llama_flash_attention_model_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None):
     logger = logging.get_logger(__name__)
 
     def forward(
@@ -659,9 +661,13 @@ def forward(
             attention_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
 
         if sp_mode in ["ring", "split_gather"]:
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group)
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds, 1, sp_group, fp8_communication=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group, 1 / sp_size)
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds, 1, sp_group, 1 / sp_size, fp8_communication=shard_config.fp8_communication
+            )
         hidden_states = inputs_embeds
 
         # decoder layers
@@ -706,9 +712,13 @@ def forward(
         hidden_states = self.norm(hidden_states)
 
         if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
+            )
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 85ec6717d0d1..0f28f6cf49a9 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -65,7 +65,6 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
             norm_cls = FusedRMSNorm
         else:
             norm_cls = RMSNorm
-
         if self.pipeline_stage_manager is not None:
             self.shard_config.enable_sequence_parallelism = False
             self.shard_config.enable_sequence_overlap = False
@@ -134,37 +133,37 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.gate_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.up_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.down_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                 ],
             )
diff --git a/tests/test_fp8/test_fp8_all_to_all.py b/tests/test_fp8/test_fp8_all_to_all.py
new file mode 100644
index 000000000000..884aab744ba0
--- /dev/null
+++ b/tests/test_fp8/test_fp8_all_to_all.py
@@ -0,0 +1,39 @@
+import torch
+import torch.distributed as dist
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.testing import assert_close
+
+from colossalai import launch
+from colossalai.accelerator import get_accelerator
+from colossalai.quantization.fp8 import all_to_all_fp8
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+
+
+@parameterize("shape", [(16, 8, 4)])
+@parameterize("scatter_dim", [0, 1, 2])
+@parameterize("dtype", [torch.bfloat16, torch.float16])
+@parameterize("fp8_format", ["e4m3", "e5m2"])
+def check_4gpu(shape, scatter_dim, dtype, fp8_format):
+    world_size = dist.get_world_size()
+    input_tensor = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
+    input_tensor_list = list(torch.chunk(input_tensor, world_size, scatter_dim))
+    input_tensor_list = [x.contiguous() for x in input_tensor_list]
+    output_tensor_list_fp8 = [torch.empty_like(x) for x in input_tensor_list]
+    output_tensor_list = [torch.empty_like(x) for x in input_tensor_list]
+    all_to_all_fp8(output_tensor_list_fp8, input_tensor_list, group=_get_default_group(), fp8_format=fp8_format)
+    dist.all_to_all(output_tensor_list, input_tensor_list, group=_get_default_group())
+    assert_close(output_tensor_list_fp8, output_tensor_list, rtol=0.1, atol=0.1)
+
+
+def run_dist(rank, world_size, port):
+    launch(rank=rank, world_size=world_size, port=port, host="localhost")
+    check_4gpu()
+
+
+@rerun_if_address_is_in_use()
+def test_all_to_all():
+    spawn(run_dist, 4)
+
+
+if __name__ == "__main__":
+    test_all_to_all()
diff --git a/tests/test_fp8/test_fp8_all_to_all_single.py b/tests/test_fp8/test_fp8_all_to_all_single.py
new file mode 100644
index 000000000000..70765f2d48de
--- /dev/null
+++ b/tests/test_fp8/test_fp8_all_to_all_single.py
@@ -0,0 +1,37 @@
+import torch
+import torch.distributed as dist
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.testing import assert_close
+
+from colossalai import launch
+from colossalai.accelerator import get_accelerator
+from colossalai.quantization.fp8 import all_to_all_single_fp8
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+
+dist.all_to_all_single
+
+
+@parameterize("shape", [(4), (8, 7), (4, 8, 16)])
+@parameterize("dtype", [torch.bfloat16, torch.float16])
+@parameterize("fp8_format", ["e4m3", "e5m2"])
+def check_4gpu(shape, dtype, fp8_format):
+    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
+    output = torch.empty_like(x)
+    output_fp8 = torch.empty_like(x)
+    all_to_all_single_fp8(output_fp8, x, group=_get_default_group(), fp8_format=fp8_format)
+    dist.all_to_all_single(output, x, group=_get_default_group())
+    assert_close(output, output_fp8, rtol=0.1, atol=0.1)
+
+
+def run_dist(rank, world_size, port):
+    launch(rank=rank, world_size=world_size, port=port, host="localhost")
+    check_4gpu()
+
+
+@rerun_if_address_is_in_use()
+def test_all_to_all_single():
+    spawn(run_dist, 4)
+
+
+if __name__ == "__main__":
+    test_all_to_all_single()
diff --git a/tests/test_fp8/test_fp8_allgather.py b/tests/test_fp8/test_fp8_allgather_flat.py
similarity index 96%
rename from tests/test_fp8/test_fp8_allgather.py
rename to tests/test_fp8/test_fp8_allgather_flat.py
index 1a4c8511a843..35e8796c2882 100644
--- a/tests/test_fp8/test_fp8_allgather.py
+++ b/tests/test_fp8/test_fp8_allgather_flat.py
@@ -32,9 +32,9 @@ def run_dist(rank, world_size, port):
 
 
 @rerun_if_address_is_in_use()
-def test_all_gather():
+def test_all_gather_flat():
     spawn(run_dist, 4)
 
 
 if __name__ == "__main__":
-    test_all_gather()
+    test_all_gather_flat()
diff --git a/tests/test_fp8/test_fp8_allreduce.py b/tests/test_fp8/test_fp8_allreduce.py
new file mode 100644
index 000000000000..c23959b5d0da
--- /dev/null
+++ b/tests/test_fp8/test_fp8_allreduce.py
@@ -0,0 +1,48 @@
+import torch
+import torch.distributed as dist
+from torch.testing import assert_close
+
+from colossalai import launch
+from colossalai.accelerator import get_accelerator
+from colossalai.quantization.fp8 import all_reduce_fp8
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+
+
+@parameterize(
+    "shape",
+    [
+        (3, 7),
+        (4, 7),
+        (7, 4),
+        (8, 9),
+        (3),
+        (7,),
+        (8,),
+    ],
+)
+@parameterize("dtype", [torch.float16, torch.bfloat16])
+@parameterize("fp8_format", ["e4m3", "e5m2"])
+def check_4gpu(shape, dtype, fp8_format):
+    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
+    x_fp8 = x.clone()
+    dist.all_reduce(x)
+    all_reduce_fp8(x_fp8, fp8_format=fp8_format)
+    assert_close(x, x_fp8, rtol=0.1, atol=0.1)
+
+    dist.all_reduce(x, op=dist.ReduceOp.AVG)
+    all_reduce_fp8(x_fp8, op=dist.ReduceOp.AVG, fp8_format=fp8_format)
+    assert_close(x, x_fp8, rtol=0.1, atol=0.1)
+
+
+def run_dist(rank, world_size, port):
+    launch(rank=rank, world_size=world_size, port=port, host="localhost")
+    check_4gpu()
+
+
+@rerun_if_address_is_in_use()
+def test_all_reduce():
+    spawn(run_dist, 4)
+
+
+if __name__ == "__main__":
+    test_all_reduce()
diff --git a/tests/test_fp8/test_fp8_gather.py b/tests/test_fp8/test_fp8_gather.py
new file mode 100644
index 000000000000..79d1d4ea49e6
--- /dev/null
+++ b/tests/test_fp8/test_fp8_gather.py
@@ -0,0 +1,48 @@
+import torch
+import torch.distributed as dist
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.testing import assert_close
+
+from colossalai import launch
+from colossalai.accelerator import get_accelerator
+from colossalai.quantization.fp8 import gather_fp8
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+
+
+@parameterize(
+    "shape",
+    [
+        (3, 7),
+        (2, 1),
+        (1, 2),
+        (2, 2),
+        (4, 2),
+        (5,),
+        (4,),
+        (2,),
+    ],
+)
+@parameterize("dtype", [torch.bfloat16, torch.float16])
+@parameterize("fp8_format", ["e4m3", "e5m2"])
+def check_4gpu(shape, dtype, fp8_format):
+    world_size = dist.get_world_size()
+    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
+    output_list = [torch.empty_like(x) for _ in range(world_size)]
+    output_list_fp8 = [torch.empty_like(x) for _ in range(world_size)]
+    gather_fp8(output_list_fp8, x, group=_get_default_group(), fp8_format=fp8_format)
+    dist.all_gather(output_list, x, group=_get_default_group())
+    assert_close(output_list, output_list_fp8, rtol=0.1, atol=0.1)
+
+
+def run_dist(rank, world_size, port):
+    launch(rank=rank, world_size=world_size, port=port, host="localhost")
+    check_4gpu()
+
+
+@rerun_if_address_is_in_use()
+def test_all_gather():
+    spawn(run_dist, 4)
+
+
+if __name__ == "__main__":
+    test_all_gather()
diff --git a/tests/test_fp8/test_fp8_reduce_scatter.py b/tests/test_fp8/test_fp8_reduce_scatter.py
new file mode 100644
index 000000000000..c18446e39ea0
--- /dev/null
+++ b/tests/test_fp8/test_fp8_reduce_scatter.py
@@ -0,0 +1,38 @@
+import torch
+from torch.distributed import reduce_scatter
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.testing import assert_close
+
+from colossalai import launch
+from colossalai.accelerator import get_accelerator
+from colossalai.quantization.fp8 import reduce_scatter_fp8
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+
+
+@parameterize("shape", [(16, 8, 4)])
+@parameterize("scatter_dim", [0, 1, 2])
+@parameterize("dtype", [torch.bfloat16, torch.float16])
+@parameterize("fp8_format", ["e4m3", "e5m2"])
+def check_4gpu(shape, scatter_dim, dtype, fp8_format):
+    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
+    input_list = list(torch.chunk(x, dim=scatter_dim, chunks=4))
+    input_list = [t.contiguous() for t in input_list]
+    output_origin = torch.empty_like(input_list[0])
+    output_fp8 = torch.empty_like(input_list[0])
+    reduce_scatter(output_origin, input_list, group=_get_default_group())
+    reduce_scatter_fp8(output_fp8, input_list, group=_get_default_group(), fp8_format=fp8_format)
+    assert_close(output_origin, output_fp8, rtol=0.1, atol=0.1)
+
+
+def run_dist(rank, world_size, port):
+    launch(rank=rank, world_size=world_size, port=port, host="localhost")
+    check_4gpu()
+
+
+@rerun_if_address_is_in_use()
+def test_reduce_scatter():
+    spawn(run_dist, 4)
+
+
+if __name__ == "__main__":
+    test_reduce_scatter()

From 0c10afd372de3b02a508e11a90c9a738a3dd072d Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Tue, 6 Aug 2024 16:29:37 +0800
Subject: [PATCH 016/167] [FP8] rebase main (#5963)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add SimPO

* fix dataloader

* remove debug code

* add orpo

* fix style

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix torch colossalai version

* update transformers version

* [shardformer] DeepseekMoE support (#5871)

* [Feature] deepseek moe expert parallel implement

* [misc] fix typo, remove redundant file (#5867)

* [misc] fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] deepseek support & unit test

* [misc] remove debug code & useless print

* [misc] fix typos (#5872)

* [Feature] remove modeling file, use auto config. (#5884)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [Deepseek] remove redundant code (#5888)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [Feature/deepseek] resolve comment. (#5889)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [misc] mv module replacement into if branch

* [misc] add some warning message and modify some code in unit test

* [misc] fix typos

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)

* Diffusion Model Inference support

* Stable Diffusion 3 Support

* pixartalpha support

* [HotFix] CI,import,requirements-test for #5838 (#5892)

* [Hot Fix] CI,import,requirements-test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] Enable PP + SP for llama (#5868)

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* use a one cross entropy func for all shardformer models

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)

* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint

* fix style

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix eval

* hotfix citation

* [zero] support all-gather overlap (#5898)

* [zero] support all-gather overlap

* [zero] add overlap all-gather flag

* [misc] fix typo

* [zero] update api

* fix orpo cross entropy loss

* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)

* Remove unnecessary calls to deepcopy

* Build DimSpec's difference dict only once

This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.

* Fix documentation of DimSpec's difference method

* [ShardFormer] fix qwen2 sp (#5903)

* [compatibility] support torch 2.2 (#5875)

* Support Pytorch 2.2.2

* keep build_on_pr file and update .compatibility

* fix object_to_tensor usage when torch>=2.3.0 (#5820)

* [misc] support torch2.3 (#5893)

* [misc] support torch2.3

* [devops] update compatibility ci

* [devops] update compatibility ci

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] remove debug

* [devops] remove debug

* [release] update version (#5912)

* [plugin] support all-gather overlap for hybrid parallel (#5919)

* [plugin] fixed all-gather overlap support for hybrid parallel

* add kto

* fix style, add kto data sample

* [Examples] Add lazy init to OPT and GPT examples (#5924)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [ColossalChat] Hotfix for ColossalChat (#5910)

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* fix ddp issue

* add Qwen 1.5 32B

* refactor tokenization

* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)

* cannot access local variable 'default_conversation' where it is not associated with a value

set default value for 'default_conversation'

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix test data

* refactor evaluation

* remove real data path

* remove real data path

* Add n_fused as an input from native_module (#5894)

* [FIX BUG] convert env param to int in (#5934)

* [Hotfix] Fix ZeRO typo #5936

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)

* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix style

* fix style

* fix style

* [shardformer] hotfix attn mask (#5945)

* [shardformer] hotfix attn mask (#5947)

* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)

* Distrifusion Support source

* comp comm overlap optimization

* sd3 benchmark

* pixart distrifusion bug fix

* sd3 bug fix and benchmark

* generation bug fix

* naming fix

* add docstring, fix counter and shape error

* add reference

* readme and requirement

* [zero] hotfix update master params (#5951)

* [release] update version (#5952)

* [Chat] Fix lora (#5946)

* fix merging

* remove filepath

* fix style

* Update README.md (#5958)

* [hotfix] Remove unused plan section (#5957)

* remove readme

* fix readme

* update

* [test] add mixtral for sequence classification

* [test] add mixtral transformer test

* [moe] fix plugin

* [test] mixtra pp shard test

* [chore] handle non member group

* [zero] solve hang

* [test] pass mixtral shardformer test

* [moe] implement transit between non moe tp and ep

* [zero] solve hang

* [misc] solve booster hang by rename the variable

* solve hang when parallel mode = pp + dp

* [moe] implement submesh initialization

* [moe] add mixtral dp grad scaling when not all experts are activated

* [chore] manually revert unintended commit

* [chore] trivial fix

* [chore] arg pass & remove drop token

* [test] add mixtral modelling test

* [moe] implement tp

* [moe] test deepseek

* [moe] clean legacy code

* [Feature] MoE Ulysses Support (#5918)

* moe sp support

* moe sp bug solve

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [chore] minor fix

* [moe] init moe plugin comm setting with sp

* moe sp + ep bug fix

* [moe] finalize test (no pp)

* [moe] full test for deepseek and mixtral (pp + sp to fix)

* [chore] minor fix after rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [chore] solve moe ckpt test failure and some other arg pass failure

* [moe] remove ops

* [test] fix test: test_zero1_2

* [bug] fix: somehow logger hangs the program

* [moe] deepseek moe sp support

* [test] add check

* [deepseek] replace attn (a workaround for bug in transformers)

* [misc] skip redunant test

* [misc] remove debug/print code

* [moe] refactor mesh assignment

* Revert "[moe] implement submesh initialization"

This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.

* [chore] change moe_pg_mesh to private

* [misc] remove incompatible test config

* [misc] fix ci failure: change default value to false in moe plugin

* [misc] remove useless condition

* [chore] docstring

* [moe] remove force_overlap_comm flag and add warning instead

* [doc] add MoeHybridParallelPlugin docstring

* [moe] solve dp axis issue

* [chore] remove redundant test case, print string & reduce test tokens

* [feat] Dist Loader for Eval (#5950)

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix tp error

* remove unused parameters

* remove unused

* update inference

* update docs

* update inference

---------

Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [lora] lora support hybrid parallel plugin (#5956)

* lora support hybrid plugin

* fix

* fix

* fix

* fix

* fp8 operators for compressed communication

cast_to_fp8, cast_from_fp8, all_reduce_fp8

* fix scaling algorithm in FP8 casting

* support fp8 communication in pipeline parallelism

* add fp8_communication flag in the script

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* shardformer fp8

* fix rebase

* remove all to all

* fix shardformer fp8 communication training degradation

* [fp8] support all-gather flat tensor (#5932)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* Update low_level_optim.py

---------

Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
---
 .compatibility                                |   2 +
 .../compatiblity_test_on_dispatch.yml         |  32 +-
 .github/workflows/compatiblity_test_on_pr.yml |  33 +-
 .../compatiblity_test_on_schedule.yml         |  33 +-
 .github/workflows/run_chatgpt_examples.yml    |   2 +
 .../Colossal-LLaMA/prepare_sft_dataset.py     |   4 +-
 applications/Colossal-LLaMA/train.py          |  18 +-
 applications/ColossalChat/.gitignore          |   3 +
 applications/ColossalChat/README.md           |  74 +-
 .../ColossalChat/benchmarks/benchmark_dpo.sh  |  51 ++
 .../ColossalChat/benchmarks/benchmark_kto.sh  |  51 ++
 .../ColossalChat/benchmarks/benchmark_orpo.sh |  51 ++
 .../ColossalChat/benchmarks/benchmark_sft.sh  |  50 ++
 .../benchmarks/benchmark_simpo.sh             |  55 ++
 .../ColossalChat/benchmarks/dummy_dataset.py  |  30 +
 .../benchmarks/prepare_dummy_test_dataset.py  | 105 +++
 .../ColossalChat/coati/dataset/__init__.py    |  10 +-
 .../coati/dataset/conversation.py             |   3 +-
 .../ColossalChat/coati/dataset/loader.py      |  87 ++
 .../coati/dataset/tokenization_utils.py       | 354 ++++----
 .../ColossalChat/coati/dataset/utils.py       |  37 +-
 .../ColossalChat/coati/models/__init__.py     |   8 +-
 .../ColossalChat/coati/models/base.py         |   1 -
 .../ColossalChat/coati/models/lora.py         | 342 ++++++--
 .../ColossalChat/coati/models/loss.py         | 113 ++-
 .../ColossalChat/coati/models/utils.py        |  10 +-
 .../ColossalChat/coati/trainer/__init__.py    |  13 +-
 .../ColossalChat/coati/trainer/dpo.py         |  48 +-
 .../ColossalChat/coati/trainer/kto.py         | 318 +++++++
 .../ColossalChat/coati/trainer/orpo.py        | 314 +++++++
 applications/ColossalChat/coati/trainer/rm.py |   1 +
 .../ColossalChat/coati/trainer/sft.py         |   3 +
 .../Qwen_Qwen1.5-32B-Chat.json                |   9 +
 .../conversation_template/tiny-llama.json     |   8 +
 applications/ColossalChat/examples/README.md  | 246 ++++--
 .../prepare_dataset.py                        |  23 +-
 .../prepare_kto_dataset.sh                    |  14 +
 .../prepare_preference_dataset.sh             |   5 +-
 .../prepare_prompt_dataset.sh                 |   3 +-
 .../prepare_sft_dataset.sh                    |   3 +-
 .../ColossalChat/examples/inference/round.txt | 104 +++
 .../ColossalChat/examples/requirements.txt    |   2 +-
 .../examples/training_scripts/hostfile        |   6 +-
 .../training_scripts/lora_config.json         |   9 +
 .../examples/training_scripts/train_dpo.py    | 101 ++-
 .../examples/training_scripts/train_dpo.sh    |  42 +-
 .../examples/training_scripts/train_kto.py    | 376 ++++++++
 .../examples/training_scripts/train_kto.sh    |  65 ++
 .../examples/training_scripts/train_orpo.py   | 341 ++++++++
 .../examples/training_scripts/train_orpo.sh   |  64 ++
 .../examples/training_scripts/train_ppo.py    |  41 +-
 .../examples/training_scripts/train_ppo.sh    |   5 +-
 .../examples/training_scripts/train_rm.py     |  79 +-
 .../examples/training_scripts/train_rm.sh     |   9 +-
 .../examples/training_scripts/train_sft.py    | 101 ++-
 .../examples/training_scripts/train_sft.sh    |  35 +-
 applications/ColossalChat/requirements.txt    |   6 +-
 .../generate_dummy_datasets_for_testing.py    |  44 +-
 .../tests/test_data/dpo/test_dpo_data.jsonl   |   2 +-
 .../tests/test_data/kto/test_kto_data.jsonl   |   1 +
 .../tests/test_data/sft/test_sft_data.jsonl   |   2 +-
 .../tests/test_data_preparation.sh            |  53 ++
 applications/ColossalChat/tests/test_lora.py  |  49 +-
 .../ColossalChat/tests/test_templating.sh     |  36 +-
 applications/ColossalChat/tests/test_train.sh | 220 ++++-
 .../ColossalChat/tests/verify_chat_data.py    |   8 +
 .../colossal_eval/dataset/agieval.py          |   4 +-
 .../colossal_eval/dataset/base.py             |  20 +-
 .../colossal_eval/dataset/ceval.py            |   4 +-
 .../colossal_eval/dataset/cmmlu.py            |   4 +-
 .../colossal_eval/dataset/colossalai.py       |   2 +-
 .../colossal_eval/dataset/cvalues.py          |   2 +-
 .../colossal_eval/dataset/gaokaobench.py      |   4 +-
 .../colossal_eval/dataset/longbench.py        |   2 +-
 .../colossal_eval/dataset/mmlu.py             |   4 +-
 .../colossal_eval/dataset/mtbench.py          |   6 +-
 .../colossal_eval/dataset/safetybench_en.py   |   2 +-
 .../colossal_eval/dataset/safetybench_zh.py   |   2 +-
 .../colossal_eval/models/huggingface.py       |  48 +-
 .../colossal_eval/utils/conversation.py       |  12 +-
 .../examples/dataset_evaluation/inference.py  |  54 +-
 .../booster/plugin/hybrid_parallel_plugin.py  |  85 +-
 .../booster/plugin/low_level_zero_plugin.py   |  52 +-
 .../plugin/moe_hybrid_parallel_plugin.py      | 396 +++++----
 .../hybrid_parallel_checkpoint_io.py          |  18 +
 colossalai/checkpoint_io/moe_checkpoint.py    |   5 +-
 colossalai/cluster/dist_coordinator.py        |   2 +-
 colossalai/cluster/process_group_mesh.py      |  12 +-
 colossalai/inference/README.md                |  12 +-
 colossalai/inference/config.py                |  64 +-
 colossalai/inference/core/base_engine.py      |  90 ++
 colossalai/inference/core/diffusion_engine.py | 200 +++++
 colossalai/inference/core/engine.py           | 800 ++----------------
 colossalai/inference/core/llm_engine.py       | 758 +++++++++++++++++
 colossalai/inference/core/request_handler.py  |  51 +-
 .../inference/modeling/layers/diffusion.py    |  54 ++
 .../inference/modeling/layers/distrifusion.py | 626 ++++++++++++++
 .../inference/modeling/models/pixart_alpha.py | 220 +++++
 .../modeling/models/stablediffusion3.py       | 178 ++++
 .../inference/modeling/policy/__init__.py     |   6 +
 .../inference/modeling/policy/pixart_alpha.py |  79 ++
 .../modeling/policy/stablediffusion3.py       |  78 ++
 colossalai/inference/struct.py                |  12 +
 colossalai/inference/utils.py                 |  37 +-
 colossalai/initialize.py                      |   6 +
 .../moe => legacy/moe/layer}/__init__.py      |   0
 .../layer/moe => legacy/moe/layer}/experts.py |  10 +-
 .../layer/moe => legacy/moe/layer}/layers.py  |   4 +-
 .../layer/moe => legacy/moe/layer}/routers.py |  10 +-
 colossalai/{ => legacy}/moe/load_balance.py   |   2 +-
 colossalai/{ => legacy}/moe/manager.py        |   0
 .../legacy/moe}/openmoe/README.md             |   0
 .../moe}/openmoe/benchmark/benchmark_cai.py   |   4 +-
 .../moe}/openmoe/benchmark/benchmark_cai.sh   |   0
 .../openmoe/benchmark/benchmark_cai_dist.sh   |   0
 .../moe}/openmoe/benchmark/benchmark_fsdp.py  |   2 +-
 .../moe}/openmoe/benchmark/benchmark_fsdp.sh  |   0
 .../moe}/openmoe/benchmark/hostfile.txt       |   0
 .../legacy/moe}/openmoe/benchmark/utils.py    |   0
 .../legacy/moe}/openmoe/infer.py              |   0
 .../legacy/moe}/openmoe/infer.sh              |   0
 .../legacy/moe}/openmoe/model/__init__.py     |   0
 .../openmoe/model/convert_openmoe_ckpt.py     |   0
 .../openmoe/model/convert_openmoe_ckpt.sh     |   0
 .../moe}/openmoe/model/modeling_openmoe.py    |   4 +-
 .../moe}/openmoe/model/openmoe_8b_config.json |   0
 .../openmoe/model/openmoe_base_config.json    |   0
 .../moe}/openmoe/model/openmoe_policy.py      |   2 +-
 .../legacy/moe}/openmoe/requirements.txt      |   0
 .../legacy/moe}/openmoe/test_ci.sh            |   0
 .../legacy/moe}/openmoe/train.py              |   2 +-
 .../legacy/moe}/openmoe/train.sh              |   0
 colossalai/{ => legacy}/moe/utils.py          |   2 +-
 .../legacy/nn/layer/parallel_1d/_operation.py |   1 -
 colossalai/moe/__init__.py                    |   5 -
 colossalai/moe/_operation.py                  |  54 +-
 colossalai/pipeline/p2p.py                    |  12 +-
 colossalai/shardformer/layer/__init__.py      |   3 +-
 colossalai/shardformer/layer/_operation.py    |   2 +-
 colossalai/shardformer/layer/attn.py          |  14 +-
 colossalai/shardformer/layer/loss.py          |  45 +-
 .../shardformer/layer/qkv_fused_linear.py     |   1 +
 colossalai/shardformer/modeling/bloom.py      |  56 +-
 colossalai/shardformer/modeling/chatglm2.py   | 221 ++++-
 colossalai/shardformer/modeling/command.py    |  87 +-
 colossalai/shardformer/modeling/deepseek.py   | 754 +++++++++++++++++
 colossalai/shardformer/modeling/gpt2.py       |  47 +-
 colossalai/shardformer/modeling/llama.py      |  75 +-
 colossalai/shardformer/modeling/mistral.py    |  50 +-
 colossalai/shardformer/modeling/mixtral.py    | 479 ++++++++++-
 colossalai/shardformer/modeling/opt.py        |  57 +-
 colossalai/shardformer/modeling/qwen2.py      | 185 ++--
 .../shardformer/policies/auto_policy.py       |  14 +-
 colossalai/shardformer/policies/chatglm2.py   |  40 +-
 colossalai/shardformer/policies/command.py    |   8 -
 colossalai/shardformer/policies/deepseek.py   | 347 ++++++++
 colossalai/shardformer/policies/llama.py      |   9 +-
 colossalai/shardformer/policies/mixtral.py    | 197 ++++-
 colossalai/shardformer/policies/qwen2.py      |  37 +-
 colossalai/shardformer/shard/shard_config.py  |   3 +
 colossalai/shardformer/shard/shardformer.py   |   4 -
 .../tensor/d_tensor/layout_converter.py       |   2 +-
 colossalai/tensor/d_tensor/sharding_spec.py   |  87 +-
 colossalai/tensor/sharding_spec.py            |  89 +-
 .../low_level/bookkeeping/gradient_store.py   |   4 +-
 colossalai/zero/low_level/low_level_optim.py  |  93 +-
 colossalai/zero/low_level/zero_hook.py        |  33 +
 examples/inference/stable_diffusion/README.md |  22 +
 .../stable_diffusion/benchmark_sd3.py         | 179 ++++
 .../stable_diffusion/compute_metric.py        |  80 ++
 .../stable_diffusion/requirements.txt         |   3 +
 .../stable_diffusion/run_benchmark.sh         |  42 +
 .../stable_diffusion/sd3_generation.py        |  81 ++
 .../inference/stable_diffusion/test_ci.sh     |   2 +
 .../gpt/hybridparallelism/finetune.py         |  16 +-
 examples/language/llama/benchmark.py          |   5 +-
 examples/language/opt/opt_benchmark.py        |  24 +-
 examples/language/opt/opt_train_demo.py       |  27 +-
 examples/language/performance_evaluator.py    |   4 +-
 requirements/requirements-test.txt            |   1 -
 requirements/requirements.txt                 |   3 +-
 tests/kit/model_zoo/transformers/__init__.py  |  21 +-
 tests/kit/model_zoo/transformers/deepseek.py  |  83 ++
 tests/kit/model_zoo/transformers/mixtral.py   |  85 ++
 tests/test_legacy/test_moe/moe_utils.py       | 136 +++
 .../test_moe/test_grad_handler.py             |   2 +-
 .../test_moe/test_moe_group.py                |   4 +-
 .../test_moe/test_moe_hybrid_zero.py          |   2 +-
 .../test_moe/test_moe_load_balance.py         |   2 +-
 tests/test_lora/test_lora.py                  |   7 +-
 tests/test_moe/moe_utils.py                   | 150 +---
 tests/test_moe/test_deepseek_layer.py         |  78 ++
 tests/test_moe/test_kernel.py                 |   2 -
 tests/test_moe/test_mixtral_layer.py          |  11 +-
 tests/test_moe/test_moe_checkpoint.py         |  69 +-
 tests/test_moe/test_moe_ep_tp.py              | 338 +++-----
 tests/test_moe/test_moe_ep_zero.py            | 119 +++
 tests/test_moe/test_moe_zero_fwd_bwd_optim.py | 132 ---
 tests/test_shardformer/test_model/_utils.py   |  12 +-
 .../test_model/test_shard_chatglm2.py         |  38 +
 .../test_model/test_shard_command.py          |  40 +
 .../test_model/test_shard_deepseek.py         | 196 +++++
 .../test_model/test_shard_llama.py            |  29 +-
 .../test_model/test_shard_mixtral.py          | 190 +++++
 .../test_model/test_shard_qwen2.py            |  75 +-
 .../test_zero/test_low_level/test_grad_acc.py |   4 +
 .../test_zero/test_low_level/test_zero1_2.py  |   2 +
 version.txt                                   |   2 +-
 208 files changed, 10962 insertions(+), 2892 deletions(-)
 create mode 100755 applications/ColossalChat/benchmarks/benchmark_dpo.sh
 create mode 100755 applications/ColossalChat/benchmarks/benchmark_kto.sh
 create mode 100755 applications/ColossalChat/benchmarks/benchmark_orpo.sh
 create mode 100755 applications/ColossalChat/benchmarks/benchmark_sft.sh
 create mode 100755 applications/ColossalChat/benchmarks/benchmark_simpo.sh
 create mode 100644 applications/ColossalChat/benchmarks/dummy_dataset.py
 create mode 100644 applications/ColossalChat/benchmarks/prepare_dummy_test_dataset.py
 create mode 100755 applications/ColossalChat/coati/trainer/kto.py
 create mode 100644 applications/ColossalChat/coati/trainer/orpo.py
 create mode 100644 applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-32B-Chat.json
 create mode 100644 applications/ColossalChat/config/conversation_template/tiny-llama.json
 create mode 100755 applications/ColossalChat/examples/data_preparation_scripts/prepare_kto_dataset.sh
 create mode 100644 applications/ColossalChat/examples/inference/round.txt
 create mode 100644 applications/ColossalChat/examples/training_scripts/lora_config.json
 create mode 100755 applications/ColossalChat/examples/training_scripts/train_kto.py
 create mode 100755 applications/ColossalChat/examples/training_scripts/train_kto.sh
 create mode 100755 applications/ColossalChat/examples/training_scripts/train_orpo.py
 create mode 100755 applications/ColossalChat/examples/training_scripts/train_orpo.sh
 create mode 100644 applications/ColossalChat/tests/test_data/kto/test_kto_data.jsonl
 create mode 100644 colossalai/inference/core/base_engine.py
 create mode 100644 colossalai/inference/core/diffusion_engine.py
 create mode 100644 colossalai/inference/core/llm_engine.py
 create mode 100644 colossalai/inference/modeling/layers/diffusion.py
 create mode 100644 colossalai/inference/modeling/layers/distrifusion.py
 create mode 100644 colossalai/inference/modeling/models/pixart_alpha.py
 create mode 100644 colossalai/inference/modeling/models/stablediffusion3.py
 create mode 100644 colossalai/inference/modeling/policy/pixart_alpha.py
 create mode 100644 colossalai/inference/modeling/policy/stablediffusion3.py
 rename colossalai/{shardformer/layer/moe => legacy/moe/layer}/__init__.py (100%)
 rename colossalai/{shardformer/layer/moe => legacy/moe/layer}/experts.py (95%)
 rename colossalai/{shardformer/layer/moe => legacy/moe/layer}/layers.py (99%)
 rename colossalai/{shardformer/layer/moe => legacy/moe/layer}/routers.py (95%)
 rename colossalai/{ => legacy}/moe/load_balance.py (99%)
 rename colossalai/{ => legacy}/moe/manager.py (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/README.md (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/benchmark/benchmark_cai.py (99%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/benchmark/benchmark_cai.sh (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/benchmark/benchmark_cai_dist.sh (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/benchmark/benchmark_fsdp.py (98%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/benchmark/benchmark_fsdp.sh (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/benchmark/hostfile.txt (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/benchmark/utils.py (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/infer.py (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/infer.sh (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/model/__init__.py (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/model/convert_openmoe_ckpt.py (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/model/convert_openmoe_ckpt.sh (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/model/modeling_openmoe.py (99%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/model/openmoe_8b_config.json (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/model/openmoe_base_config.json (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/model/openmoe_policy.py (99%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/requirements.txt (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/test_ci.sh (100%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/train.py (99%)
 rename {examples/language => colossalai/legacy/moe}/openmoe/train.sh (100%)
 rename colossalai/{ => legacy}/moe/utils.py (99%)
 create mode 100644 colossalai/shardformer/modeling/deepseek.py
 create mode 100644 colossalai/shardformer/policies/deepseek.py
 create mode 100644 colossalai/zero/low_level/zero_hook.py
 create mode 100644 examples/inference/stable_diffusion/README.md
 create mode 100644 examples/inference/stable_diffusion/benchmark_sd3.py
 create mode 100644 examples/inference/stable_diffusion/compute_metric.py
 create mode 100644 examples/inference/stable_diffusion/requirements.txt
 create mode 100644 examples/inference/stable_diffusion/run_benchmark.sh
 create mode 100644 examples/inference/stable_diffusion/sd3_generation.py
 create mode 100644 examples/inference/stable_diffusion/test_ci.sh
 create mode 100644 tests/kit/model_zoo/transformers/deepseek.py
 create mode 100644 tests/kit/model_zoo/transformers/mixtral.py
 create mode 100644 tests/test_legacy/test_moe/moe_utils.py
 rename tests/{ => test_legacy}/test_moe/test_grad_handler.py (98%)
 rename tests/{ => test_legacy}/test_moe/test_moe_group.py (95%)
 rename tests/{ => test_legacy}/test_moe/test_moe_hybrid_zero.py (98%)
 rename tests/{ => test_legacy}/test_moe/test_moe_load_balance.py (99%)
 create mode 100644 tests/test_moe/test_deepseek_layer.py
 create mode 100644 tests/test_moe/test_moe_ep_zero.py
 delete mode 100644 tests/test_moe/test_moe_zero_fwd_bwd_optim.py
 create mode 100644 tests/test_shardformer/test_model/test_shard_deepseek.py
 create mode 100644 tests/test_shardformer/test_model/test_shard_mixtral.py

diff --git a/.compatibility b/.compatibility
index d90a74b584d8..4f808740bc02 100644
--- a/.compatibility
+++ b/.compatibility
@@ -1 +1,3 @@
 2.1.0-12.1.0
+2.2.2-12.1.0
+2.3.0-12.1.0
diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml
index 3eee564c29ea..1a458d7bbc96 100644
--- a/.github/workflows/compatiblity_test_on_dispatch.yml
+++ b/.github/workflows/compatiblity_test_on_dispatch.yml
@@ -55,41 +55,27 @@ jobs:
     steps:
       - name: Install dependencies
         run: |
-          pip install -U pip setuptools==68.2.2 wheel --user
-      - uses: actions/checkout@v2
-        with:
-          repository: hpcaitech/TensorNVMe
-          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
-          path: TensorNVMe
-      - name: Install tensornvme
-        run: |
-          cd TensorNVMe
           apt update && apt install -y cmake
-          pip install -r requirements.txt
-          DISABLE_URING=1 pip install -v .
+          pip install -U pip setuptools==68.2.2 wheel --user
+
       - uses: actions/checkout@v2
         with:
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
-      - name: Download cub for CUDA 10.2
-        run: |
-          CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')
 
-          # check if it is CUDA 10.2
-          # download cub
-          if [ "$CUDA_VERSION" = "10.2" ]; then
-            wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
-            unzip 1.8.0.zip
-            cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
-          fi
       - name: Install Colossal-AI
         run: |
           BUILD_EXT=1 pip install -v .
-          pip install -r requirements/requirements-test.txt
+          pip install --no-cache-dir -r requirements/requirements-test.txt
+
+      - name: Install tensornvme
+        run: |
+          DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git
+
       - name: Unit Testing
         run: |
           PYTHONPATH=$PWD pytest --durations=0 tests
         env:
           DATA: /data/scratch/cifar-10
-          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+          LD_LIBRARY_PATH: /github/home/.tensornvme/lib
           LLAMA_PATH: /data/scratch/llama-tiny
           MOE_TENSOR_PATH: /data/scratch/moe_tensors
diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml
index b418c843e7f6..770f4b933156 100644
--- a/.github/workflows/compatiblity_test_on_pr.yml
+++ b/.github/workflows/compatiblity_test_on_pr.yml
@@ -49,42 +49,27 @@ jobs:
     steps:
       - name: Install dependencies
         run: |
-          pip install -U pip setuptools==68.2.2 wheel --user
-      - uses: actions/checkout@v2
-        with:
-          repository: hpcaitech/TensorNVMe
-          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
-          path: TensorNVMe
-      - name: Install tensornvme
-        run: |
-          cd TensorNVMe
           apt update && apt install -y cmake
-          pip install -r requirements.txt
-          DISABLE_URING=1 pip install -v .
+          pip install -U pip setuptools==68.2.2 wheel --user
+
       - uses: actions/checkout@v2
         with:
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
-      - name: Download cub for CUDA 10.2
-        run: |
-          CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')
-
-          # check if it is CUDA 10.2
-          # download cub
-          if [ "$CUDA_VERSION" = "10.2" ]; then
-            wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
-            unzip 1.8.0.zip
-            cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
-          fi
 
       - name: Install Colossal-AI
         run: |
           BUILD_EXT=1 pip install -v .
-          pip install -r requirements/requirements-test.txt
+          pip install --no-cache-dir -r requirements/requirements-test.txt
+
+      - name: Install tensornvme
+        run: |
+          DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git
+
       - name: Unit Testing
         run: |
           PYTHONPATH=$PWD pytest --durations=0 tests
         env:
           DATA: /data/scratch/cifar-10
-          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+          LD_LIBRARY_PATH: /github/home/.tensornvme/lib
           LLAMA_PATH: /data/scratch/llama-tiny
           MOE_TENSOR_PATH: /data/scratch/moe_tensors
diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml
index 8d98e775c828..c6455604f070 100644
--- a/.github/workflows/compatiblity_test_on_schedule.yml
+++ b/.github/workflows/compatiblity_test_on_schedule.yml
@@ -43,47 +43,28 @@ jobs:
     steps:
       - name: Install dependencies
         run: |
+          apt update && apt install -y cmake
           pip install -U pip setuptools==68.2.2 wheel --user
 
-      - uses: actions/checkout@v2
-        with:
-          repository: hpcaitech/TensorNVMe
-          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
-          path: TensorNVMe
-
-      - name: Install tensornvme
-        run: |
-          cd TensorNVMe
-          apt update && apt install -y cmake
-          pip install -r requirements.txt
-          DISABLE_URING=1 pip install -v .
       - uses: actions/checkout@v2
         with:
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
 
-      - name: Download cub for CUDA 10.2
-        run: |
-          CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')
-
-          # check if it is CUDA 10.2
-          # download cub
-          if [ "$CUDA_VERSION" = "10.2" ]; then
-            wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
-            unzip 1.8.0.zip
-            cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
-          fi
-
       - name: Install Colossal-AI
         run: |
           BUILD_EXT=1 pip install -v .
-          pip install -r requirements/requirements-test.txt
+          pip install --no-cache-dir -r requirements/requirements-test.txt
+
+      - name: Install tensornvme
+        run: |
+          DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git
 
       - name: Unit Testing
         run: |
           PYTHONPATH=$PWD pytest --durations=0 tests
         env:
           DATA: /data/scratch/cifar-10
-          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+          LD_LIBRARY_PATH: /github/home/.tensornvme/lib
           LLAMA_PATH: /data/scratch/llama-tiny
           MOE_TENSOR_PATH: /data/scratch/moe_tensors
 
diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index 4ea86b609267..d0b5c2164119 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -52,6 +52,7 @@ jobs:
           mkdir sft_data
           mkdir prompt_data
           mkdir preference_data
+          mkdir kto_data
           ./tests/test_data_preparation.sh
           ./tests/test_train.sh
         env:
@@ -61,3 +62,4 @@ jobs:
           SFT_DATASET: ./sft_data
           PROMPT_DATASET: ./prompt_data
           PREFERENCE_DATASET: ./preference_data
+          KTO_DATASET: ./kto_data
diff --git a/applications/Colossal-LLaMA/prepare_sft_dataset.py b/applications/Colossal-LLaMA/prepare_sft_dataset.py
index a857d6c0c696..fe57907601f6 100644
--- a/applications/Colossal-LLaMA/prepare_sft_dataset.py
+++ b/applications/Colossal-LLaMA/prepare_sft_dataset.py
@@ -10,7 +10,7 @@
 import os
 from multiprocessing import cpu_count
 
-from colossal_llama.dataset.conversation import LLaMA2_Conv
+from colossal_llama.dataset.conversation import LLaMA2_Conv, LLaMA3_Conv
 from colossal_llama.dataset.spliced_and_tokenized_dataset import supervised_tokenize_sft
 from datasets import dataset_dict, load_dataset
 from transformers import AddedToken, AutoTokenizer
@@ -75,6 +75,8 @@ def main():
     # Prepare to the tokenizer.
     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)
 
+    default_conversation = LLaMA3_Conv
+
     # Fix </s> split issue: https://github.com/huggingface/transformers/issues/23833
     if args.llama_version == 2:
         tokenizer.add_tokens(AddedToken("</s>", normalized=False, special=True), special_tokens=True)
diff --git a/applications/Colossal-LLaMA/train.py b/applications/Colossal-LLaMA/train.py
index 43a360a9a49c..e74aad33c3e3 100644
--- a/applications/Colossal-LLaMA/train.py
+++ b/applications/Colossal-LLaMA/train.py
@@ -128,6 +128,12 @@ def main() -> None:
     parser.add_argument("--zero", type=int, default=1)
     parser.add_argument("--pad_token", choices=["eos", "unk"], default="eos")
     parser.add_argument("--padding_mode", choices=["max_length", "longest"], default="max_length")
+    parser.add_argument(
+        "--skip_save_each_epoch",
+        action="store_true",
+        default=False,
+        help="skip saving the model checkpoint after each epoch is completed.",
+    )
     args = parser.parse_args()
 
     with open(args.config_file, "w") as f:
@@ -370,11 +376,17 @@ def main() -> None:
                     )
                 total_loss.fill_(0.0)
                 pbar.update()
+
             # Save modeling.
 
-            if (args.save_interval > 0 and (step + 1) % (args.save_interval * args.accumulation_steps) == 0) or (
-                step + 1
-            ) == len(dataloader):
+            save_model_condition = (
+                args.save_interval > 0 and (step + 1) % (args.save_interval * args.accumulation_steps) == 0
+            )
+
+            if not args.skip_save_each_epoch:
+                save_model_condition = save_model_condition or (step + 1) == len(dataloader)
+
+            if save_model_condition:
                 coordinator.print_on_master("\nStart saving model checkpoint with running states")
 
                 if args.use_neft:
diff --git a/applications/ColossalChat/.gitignore b/applications/ColossalChat/.gitignore
index 33950adc0bb5..757cbb5da051 100755
--- a/applications/ColossalChat/.gitignore
+++ b/applications/ColossalChat/.gitignore
@@ -146,6 +146,9 @@ docs/.build
 examples/wandb/
 examples/logs/
 examples/output/
+examples/training_scripts/logs
+examples/training_scripts/wandb
+examples/training_scripts/output
 
 examples/awesome-chatgpt-prompts/
 temp/
diff --git a/applications/ColossalChat/README.md b/applications/ColossalChat/README.md
index 769f0b3d072c..de27ebaf6be1 100755
--- a/applications/ColossalChat/README.md
+++ b/applications/ColossalChat/README.md
@@ -23,6 +23,10 @@
   - [Open QA](#open-qa)
   - [Limitation for LLaMA-finetuned models](#limitation)
   - [Limitation of dataset](#limitation)
+- [Alternative Option For RLHF: DPO](#alternative-option-for-rlhf-direct-preference-optimization)
+- [Alternative Option For RLHF: SimPO](#alternative-option-for-rlhf-simple-preference-optimization-simpo)
+- [Alternative Option For RLHF: ORPO](#alternative-option-for-rlhf-odds-ratio-preference-optimization-orpo)
+- [Alternative Option For RLHF: KTO](#alternative-option-for-rlhf-kahneman-tversky-optimization-kto)
 - [FAQ](#faq)
   - [How to save/load checkpoint](#faq)
   - [How to train with limited resources](#faq)
@@ -135,17 +139,15 @@ The first step in Stage 1 is to collect a dataset of human demonstrations of the
     {"messages":
       [
         {
-          "from": "human",
+          "from": "user",
           "content": "what are some pranks with a pen i can do?"
         },
         {
           "from": "assistant",
           "content": "Are you looking for practical joke ideas?"
         },
-        ...
       ]
     },
-    ...
 ]
 ```
 
@@ -171,23 +173,20 @@ Below shows the preference dataset format used in training the reward model.
           "from": "human",
           "content": "Introduce butterflies species in Oregon."
         }
-      ]
+      ],
       "chosen": [
         {
           "from": "assistant",
           "content": "About 150 species of butterflies live in Oregon, with about 100 species are moths..."
         },
-        ...
       ],
       "rejected": [
         {
           "from": "assistant",
           "content": "Are you interested in just the common butterflies?  There are a few common ones which will be easy to find..."
         },
-        ...
       ]
     },
-    ...
 ]
 ```
 
@@ -216,7 +215,6 @@ PPO uses two kind of training data--- the prompt data and the sft data (optional
           "from": "human",
           "content": "what are some pranks with a pen i can do?"
         }
-        ...
       ]
     },
 ]
@@ -262,9 +260,8 @@ experience buffer size
 = train_batch_size * accumulation_steps * num_tp_group
 ```
 
-## Alternative Option For RLHF: Direct Preference Optimization
-
-For those seeking an alternative to Reinforcement Learning from Human Feedback (RLHF), Direct Preference Optimization (DPO) presents a compelling option. DPO, as detailed in the paper (available at [https://arxiv.org/abs/2305.18290](https://arxiv.org/abs/2305.18290)), DPO offers an low-cost way to perform RLHF and usually request less computation resources compares to PPO.
+## Alternative Option For RLHF: Direct Preference Optimization (DPO)
+For those seeking an alternative to Reinforcement Learning from Human Feedback (RLHF), Direct Preference Optimization (DPO) presents a compelling option. DPO, as detailed in this [paper](https://arxiv.org/abs/2305.18290), DPO offers an low-cost way to perform RLHF and usually request less computation resources compares to PPO. Read this [README](./examples/README.md) for more information.
 
 ### DPO Training Stage1 - Supervised Instructs Tuning
 
@@ -277,6 +274,15 @@ For DPO training, you only need the preference dataset. Please follow the instru
 #### Step 2: Training
 You can run the [train_dpo.sh](./examples/training_scripts/train_dpo.sh) to start DPO training. More detais can be found in [example guideline](./examples/README.md).
 
+## Alternative Option For RLHF: Simple Preference Optimization (SimPO)
+Simple Preference Optimization (SimPO) from this [paper](https://arxiv.org/pdf/2405.14734) is similar to DPO but it abandons the use of the reference model, which makes the training more efficient. It also adds a reward shaping term called target reward margin to enhance training stability. It also use length normalization to better align with the inference process. Read this [README](./examples/README.md) for more information.
+
+## Alternative Option For RLHF: Odds Ratio Preference Optimization (ORPO)
+Odds Ratio Preference Optimization (ORPO) from this [paper](https://arxiv.org/pdf/2403.07691) is a reference model free alignment method that use a mixture of SFT loss and a reinforcement leanring loss calculated based on odds-ratio-based implicit reward to makes the training more efficient and stable. Read this [README](./examples/README.md) for more information.
+
+## Alternative Option For RLHF: Kahneman-Tversky Optimization (KTO)
+We support the method introduced in the paper [KTO:Model Alignment as Prospect Theoretic Optimization](https://arxiv.org/pdf/2402.01306) (KTO). Which is a aligment method that directly maximize "human utility" of generation results. Read this [README](./examples/README.md) for more information.
+
 ### Inference Quantization and Serving - After Training
 
 We provide an online inference server and a benchmark. We aim to run inference on single GPU, so quantization is essential when using large models.
@@ -441,20 +447,6 @@ If you only have a single 24G GPU. Generally, using lora and "zero2-cpu" will be
 If you have multiple GPUs each has very limited VRAM, say 8GB. You can try the `3d` for the plugin option, which supports tensor parellelism, set `--tp` to the number of GPUs that you have.
 </details>
 
-## The Plan
-
-- [x] implement PPO fine-tuning
-- [x] implement training reward model
-- [x] support LoRA
-- [x] support inference
-- [x] support llama from [facebook](https://github.com/facebookresearch/llama)
-- [x] implement PPO-ptx fine-tuning
-- [x] support flash-attention
-- [x] implement DPO fine-tuning
-- [ ] integrate with Ray
-- [ ] support more RL paradigms, like Implicit Language Q-Learning (ILQL),
-- [ ] support chain-of-thought by [langchain](https://github.com/hwchase17/langchain)
-
 ### Real-time progress
 
 You will find our progress in github [project broad](https://github.com/orgs/hpcaitech/projects/17/views/1).
@@ -522,7 +514,7 @@ Coati is developed by ColossalAI Team:
 - [Fazzie](https://fazzie-key.cool/about/index.html) Contributing to the algorithm and development for SFT.
 - [ofey404](https://github.com/ofey404) Contributing to both front-end and back-end development.
 - [Wenhao Chen](https://github.com/CWHer) Contributing to subsequent code enhancements and performance improvements.
-- [Anbang Ye](https://github.com/YeAnbang) Contributing to the refactored version with updated acceleration framework, LoRA, DPO and PPO.
+- [Anbang Ye](https://github.com/YeAnbang) Contributing to the refactored PPO version with updated acceleration framework. Add support for DPO, SimPO, ORPO.
 
 The PhD student from [(HPC-AI) Lab](https://ai.comp.nus.edu.sg/) also contributed a lot to this project.
 - [Zangwei Zheng](https://github.com/zhengzangw)
@@ -572,6 +564,36 @@ We also appreciate the valuable suggestions provided by [Jian Hu](https://github
   journal = {GitHub repository},
   howpublished = {\url{https://github.com/XueFuzhao/InstructionWild}},
 }
+
+@misc{meng2024simposimplepreferenceoptimization,
+      title={SimPO: Simple Preference Optimization with a Reference-Free Reward},
+      author={Yu Meng and Mengzhou Xia and Danqi Chen},
+      year={2024},
+      eprint={2405.14734},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2405.14734},
+}
+
+@misc{rafailov2023directpreferenceoptimizationlanguage,
+      title={Direct Preference Optimization: Your Language Model is Secretly a Reward Model},
+      author={Rafael Rafailov and Archit Sharma and Eric Mitchell and Stefano Ermon and Christopher D. Manning and Chelsea Finn},
+      year={2023},
+      eprint={2305.18290},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG},
+      url={https://arxiv.org/abs/2305.18290},
+}
+
+@misc{hong2024orpomonolithicpreferenceoptimization,
+      title={ORPO: Monolithic Preference Optimization without Reference Model},
+      author={Jiwoo Hong and Noah Lee and James Thorne},
+      year={2024},
+      eprint={2403.07691},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2403.07691},
+}
 ```
 
 ## Licenses
diff --git a/applications/ColossalChat/benchmarks/benchmark_dpo.sh b/applications/ColossalChat/benchmarks/benchmark_dpo.sh
new file mode 100755
index 000000000000..44d821a87fee
--- /dev/null
+++ b/applications/ColossalChat/benchmarks/benchmark_dpo.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+
+PROJECT_NAME="dpo"
+PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs
+PRETRAINED_MODEL_PATH="" # huggingface or local model path
+PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
+BENCHMARK_DATA_DIR="./temp/dpo" # Path to benchmark data
+DATASET_SIZE=320
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+declare -a dataset=(
+    $BENCHMARK_DATA_DIR/arrow/part-0
+)
+
+# Generate dummy test data
+python prepare_dummy_test_dataset.py --data_dir $BENCHMARK_DATA_DIR --dataset_size $DATASET_SIZE --max_length 2048 --data_type preference
+
+
+colossalai run --nproc_per_node 4 --master_port 31313 ../examples/training_scripts/train_dpo.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --plugin "zero2_cpu" \
+    --max_epochs 1 \
+    --accumulation_steps 1 \
+    --batch_size 4 \
+    --lr 1e-6 \
+    --beta 0.1 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --max_length 2048 \
+    --weight_decay 0.01 \
+    --warmup_steps 60 \
+    --grad_checkpoint \
+    --use_flash_attn
diff --git a/applications/ColossalChat/benchmarks/benchmark_kto.sh b/applications/ColossalChat/benchmarks/benchmark_kto.sh
new file mode 100755
index 000000000000..82d3e3421acb
--- /dev/null
+++ b/applications/ColossalChat/benchmarks/benchmark_kto.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+
+PROJECT_NAME="kto"
+PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs
+PRETRAINED_MODEL_PATH="" # huggingface or local model path
+PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
+BENCHMARK_DATA_DIR="./temp/kto" # Path to benchmark data
+DATASET_SIZE=80
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+declare -a dataset=(
+    $BENCHMARK_DATA_DIR/arrow/part-0
+)
+
+# Generate dummy test data
+python prepare_dummy_test_dataset.py --data_dir $BENCHMARK_DATA_DIR --dataset_size $DATASET_SIZE --max_length 2048 --data_type kto
+
+
+colossalai run --nproc_per_node 2 --master_port 31313 ../examples/training_scripts/train_kto.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --plugin "zero2_cpu" \
+    --max_epochs 1 \
+    --accumulation_steps 1 \
+    --batch_size 2 \
+    --lr 1e-5 \
+    --beta 0.1 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --max_length 2048 \
+    --weight_decay 0.01 \
+    --warmup_steps 60 \
+    --grad_checkpoint \
+    --use_flash_attn
diff --git a/applications/ColossalChat/benchmarks/benchmark_orpo.sh b/applications/ColossalChat/benchmarks/benchmark_orpo.sh
new file mode 100755
index 000000000000..f8fb264aeaae
--- /dev/null
+++ b/applications/ColossalChat/benchmarks/benchmark_orpo.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
+
+PROJECT_NAME="orpo"
+PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs
+PRETRAINED_MODEL_PATH="" # huggingface or local model path
+PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
+BENCHMARK_DATA_DIR="./temp/orpo" # Path to benchmark data
+DATASET_SIZE=160
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+declare -a dataset=(
+    $BENCHMARK_DATA_DIR/arrow/part-0
+)
+
+# Generate dummy test data
+python prepare_dummy_test_dataset.py --data_dir $BENCHMARK_DATA_DIR --dataset_size $DATASET_SIZE --max_length 2048 --data_type preference
+
+
+colossalai run --nproc_per_node 2 --master_port 31313 ../examples/training_scripts/train_orpo.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --plugin "zero2" \
+    --max_epochs 1 \
+    --accumulation_steps 1 \
+    --batch_size 4 \
+    --lr 8e-6 \
+    --lam 0.5 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --max_length 2048 \
+    --weight_decay 0.01 \
+    --warmup_steps 60 \
+    --grad_checkpoint \
+    --use_flash_attn
diff --git a/applications/ColossalChat/benchmarks/benchmark_sft.sh b/applications/ColossalChat/benchmarks/benchmark_sft.sh
new file mode 100755
index 000000000000..efcd428dd21e
--- /dev/null
+++ b/applications/ColossalChat/benchmarks/benchmark_sft.sh
@@ -0,0 +1,50 @@
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+
+PROJECT_NAME="sft"
+PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs
+PRETRAINED_MODEL_PATH="" # huggingface or local model path
+PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
+BENCHMARK_DATA_DIR="./temp/sft" # Path to benchmark data
+DATASET_SIZE=640
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+declare -a dataset=(
+    $BENCHMARK_DATA_DIR/arrow/part-0
+)
+
+
+# Generate dummy test data
+python prepare_dummy_test_dataset.py --data_dir $BENCHMARK_DATA_DIR --dataset_size $DATASET_SIZE --max_length 2048 --data_type sft
+
+
+# the real batch size for gradient descent is number_of_node_in_hostfile * nproc_per_node * train_batch_size
+colossalai run --nproc_per_node 1 --master_port 31312 ../examples/training_scripts/train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --plugin zero2 \
+    --batch_size 8 \
+    --max_epochs 1 \
+    --accumulation_steps 1 \
+    --lr 5e-5 \
+    --lora_rank 32 \
+    --max_len 2048 \
+    --grad_checkpoint \
+    --use_flash_attn
diff --git a/applications/ColossalChat/benchmarks/benchmark_simpo.sh b/applications/ColossalChat/benchmarks/benchmark_simpo.sh
new file mode 100755
index 000000000000..47dfc8595e74
--- /dev/null
+++ b/applications/ColossalChat/benchmarks/benchmark_simpo.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+
+PROJECT_NAME="simpo"
+PARENT_CONFIG_FILE="./benchmark_config" # Path to a folder to save training config logs
+PRETRAINED_MODEL_PATH="" # huggingface or local model path
+PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
+BENCHMARK_DATA_DIR="./temp/simpo" # Path to benchmark data
+DATASET_SIZE=640
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+declare -a dataset=(
+    $BENCHMARK_DATA_DIR/arrow/part-0
+)
+
+# Generate dummy test data
+python prepare_dummy_test_dataset.py --data_dir $BENCHMARK_DATA_DIR --dataset_size $DATASET_SIZE --max_length 2048 --data_type preference
+
+
+colossalai run --nproc_per_node 4 --master_port 31313 ../examples/training_scripts/train_dpo.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --plugin "zero2_cpu" \
+    --loss_type "simpo_loss" \
+    --max_epochs 1 \
+    --accumulation_steps 1 \
+    --batch_size 8 \
+    --lr 1e-6 \
+    --beta 0.1 \
+    --gamma 0.6 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --max_length 2048 \
+    --weight_decay 0.01 \
+    --warmup_steps 60 \
+    --disable_reference_model \
+    --length_normalization \
+    --grad_checkpoint \
+    --use_flash_attn
diff --git a/applications/ColossalChat/benchmarks/dummy_dataset.py b/applications/ColossalChat/benchmarks/dummy_dataset.py
new file mode 100644
index 000000000000..9af0f164173f
--- /dev/null
+++ b/applications/ColossalChat/benchmarks/dummy_dataset.py
@@ -0,0 +1,30 @@
+from typing import Callable
+
+from torch.utils.data import Dataset
+
+
+class DummyLLMDataset(Dataset):
+    def __init__(self, keys, seq_len, size=500, gen_fn={}):
+        self.keys = keys
+        self.gen_fn = gen_fn
+        self.seq_len = seq_len
+        self.data = self._generate_data()
+        self.size = size
+
+    def _generate_data(self):
+        data = {}
+        for key in self.keys:
+            if key in self.gen_fn:
+                data[key] = self.gen_fn[key]
+            else:
+                data[key] = [1] * self.seq_len
+        return data
+
+    def __len__(self):
+        return self.size
+
+    def __getitem__(self, idx):
+        return {
+            key: self.data[key] if not isinstance(self.data[key], Callable) else self.data[key](idx)
+            for key in self.keys
+        }
diff --git a/applications/ColossalChat/benchmarks/prepare_dummy_test_dataset.py b/applications/ColossalChat/benchmarks/prepare_dummy_test_dataset.py
new file mode 100644
index 000000000000..f501c53582e6
--- /dev/null
+++ b/applications/ColossalChat/benchmarks/prepare_dummy_test_dataset.py
@@ -0,0 +1,105 @@
+import argparse
+import json
+import os
+import time
+from multiprocessing import cpu_count
+
+from datasets import load_dataset
+from dummy_dataset import DummyLLMDataset
+
+from colossalai.logging import get_dist_logger
+
+logger = get_dist_logger()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--data_dir",
+        type=str,
+        required=True,
+        default=None,
+        help="The output dir",
+    )
+    parser.add_argument(
+        "--dataset_size",
+        type=int,
+        required=True,
+        default=None,
+        help="The size of data",
+    )
+    parser.add_argument(
+        "--max_length",
+        type=int,
+        required=True,
+        default=None,
+        help="The max length of data",
+    )
+    parser.add_argument(
+        "--data_type",
+        type=str,
+        required=True,
+        default=None,
+        help="The type of data, choose one from ['sft', 'prompt', 'preference', 'kto']",
+    )
+    args = parser.parse_args()
+    if args.data_type == "sft":
+        dataset = DummyLLMDataset(["input_ids", "attention_mask", "labels"], args.max_length, args.dataset_size)
+    elif args.data_type == "prompt":
+        # pass PPO dataset is prepared separately
+        pass
+    elif args.data_type == "preference":
+        dataset = DummyLLMDataset(
+            ["chosen_input_ids", "chosen_loss_mask", "rejected_input_ids", "rejected_loss_mask"],
+            args.max_length,
+            args.dataset_size,
+        )
+    elif args.data_type == "kto":
+        dataset = DummyLLMDataset(
+            ["prompt", "completion", "label"],
+            args.max_length - 512,
+            args.dataset_size,
+            gen_fn={
+                "completion": lambda x: [1] * 512,
+                "label": lambda x: x % 2,
+            },
+        )
+    else:
+        raise ValueError(f"Unknown data type {args.data_type}")
+
+    # Save each jsonl spliced dataset.
+    output_index = "0"
+    output_name = f"part-{output_index}"
+    os.makedirs(args.data_dir, exist_ok=True)
+    output_jsonl_path = os.path.join(args.data_dir, "json")
+    output_arrow_path = os.path.join(args.data_dir, "arrow")
+    output_cache_path = os.path.join(args.data_dir, "cache")
+    os.makedirs(output_jsonl_path, exist_ok=True)
+    os.makedirs(output_arrow_path, exist_ok=True)
+    output_jsonl_file_path = os.path.join(output_jsonl_path, output_name + ".jsonl")
+    st = time.time()
+    with open(file=output_jsonl_file_path, mode="w", encoding="utf-8") as fp_writer:
+        count = 0
+        for i in range(len(dataset)):
+            data_point = dataset[i]
+            if count % 500 == 0:
+                logger.info(f"processing {count} spliced data points for {fp_writer.name}")
+            count += 1
+            fp_writer.write(json.dumps(data_point, ensure_ascii=False) + "\n")
+    logger.info(
+        f"Current file {fp_writer.name}; "
+        f"Data size: {len(dataset)}; "
+        f"Time cost: {round((time.time() - st) / 60, 6)} minutes."
+    )
+    # Save each arrow spliced dataset
+    output_arrow_file_path = os.path.join(output_arrow_path, output_name)
+    logger.info(f"Start to save {output_arrow_file_path}")
+    dataset = load_dataset(
+        path="json",
+        data_files=[output_jsonl_file_path],
+        cache_dir=os.path.join(output_cache_path, "tokenized"),
+        keep_in_memory=False,
+        num_proc=cpu_count(),
+        split="train",
+    )
+    dataset.save_to_disk(dataset_path=output_arrow_file_path, num_proc=min(len(dataset), cpu_count()))
diff --git a/applications/ColossalChat/coati/dataset/__init__.py b/applications/ColossalChat/coati/dataset/__init__.py
index deb7b6d926fb..8e9060a1a1f9 100755
--- a/applications/ColossalChat/coati/dataset/__init__.py
+++ b/applications/ColossalChat/coati/dataset/__init__.py
@@ -1,24 +1,26 @@
 from .conversation import Conversation, setup_conversation_template
 from .loader import (
+    DataCollatorForKTODataset,
     DataCollatorForPreferenceDataset,
     DataCollatorForPromptDataset,
     DataCollatorForSupervisedDataset,
     StatefulDistributedSampler,
     load_tokenized_dataset,
 )
-from .tokenization_utils import supervised_tokenize_sft, tokenize_prompt_dataset, tokenize_rlhf
+from .tokenization_utils import tokenize_kto, tokenize_prompt, tokenize_rlhf, tokenize_sft
 
 __all__ = [
-    "tokenize_prompt_dataset",
+    "tokenize_prompt",
     "DataCollatorForPromptDataset",
     "is_rank_0",
     "DataCollatorForPreferenceDataset",
     "DataCollatorForSupervisedDataset",
+    "DataCollatorForKTODataset",
     "StatefulDistributedSampler",
     "load_tokenized_dataset",
-    "supervised_tokenize_pretrain",
-    "supervised_tokenize_sft",
+    "tokenize_sft",
     "tokenize_rlhf",
+    "tokenize_kto",
     "setup_conversation_template",
     "Conversation",
 ]
diff --git a/applications/ColossalChat/coati/dataset/conversation.py b/applications/ColossalChat/coati/dataset/conversation.py
index 37900f3b8d64..a77c220d34af 100755
--- a/applications/ColossalChat/coati/dataset/conversation.py
+++ b/applications/ColossalChat/coati/dataset/conversation.py
@@ -18,6 +18,7 @@ class Conversation:
     chat_template: str
     stop_ids: List[int]
     end_of_assistant: str
+    roles = ["user", "assistant"]
 
     @classmethod
     def from_config(cls, tokenizer: PreTrainedTokenizer, config: Dict):
@@ -85,7 +86,7 @@ def append_message(self, role: str, message: str):
         Raises:
             AssertionError: If the role is not 'user' or 'assistant'.
         """
-        assert role in ["user", "assistant"]
+        assert role in self.roles
         self.messages.append({"role": role, "content": message})
 
     def copy(self):
diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py
index a0cd17bb47fe..b92cd76adc38 100755
--- a/applications/ColossalChat/coati/dataset/loader.py
+++ b/applications/ColossalChat/coati/dataset/loader.py
@@ -28,6 +28,8 @@ def load_tokenized_dataset(
     Each instance of dataset is a dictionary with
     `{'input_ids': List[int], 'labels': List[int], sequence: str}` format.
     """
+    if not dataset_paths:
+        return None
     mode_map = kwargs.get("mode_map", {"train": "train", "dev": "validation", "test": "test"})
     assert mode in tuple(mode_map), f"Unsupported mode {mode}, it must be in {tuple(mode_map)}"
 
@@ -233,6 +235,91 @@ def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch
         )
 
 
+@dataclass
+class DataCollatorForKTODataset(object):
+    """
+    Collate instances for kto dataset.
+    Each input instance is a tokenized dictionary with fields
+    `prompt`(List[int]), `completion`(List[int]) and `label`(bool).
+    Each output instance is a tokenized dictionary with fields
+    `kl_input_ids`(List[int]), `kl_attention_mask`(List[int]) and `kl_loss_mask`(List[int]).
+    `input_ids`(List[int]), `attention_mask`(List[int]), `loss_mask`(List[int]) and `label`(bool).
+    """
+
+    tokenizer: PreTrainedTokenizer
+    max_length: int = 4096
+    ignore_index: int = -100
+
+    def __call__(self, instances: Sequence[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
+        """
+
+        Args:
+            instances (`Sequence[Dict[str, List[int]]]`):
+                Mini-batch samples, each sample is stored in an individual dictionary contains the following fields:
+                `prompt`(List[int]), `completion`(List[int]) and `label`(bool, if the sample is desirable or not).
+
+        Returns:
+            (`Dict[str, torch.Tensor]`): Contains the following `torch.Tensor`:
+                `input_ids`: `torch.Tensor` of shape (bsz, max_len);
+                `attention_mask`: `torch.BoolTensor` of shape (bsz, max_len);
+                `labels`: `torch.Tensor` of shape (bsz, max_len), which contains `IGNORE_INDEX`.
+        """
+        assert isinstance(self.tokenizer.pad_token_id, int) and self.tokenizer.pad_token_id >= 0, (
+            f"`{self.tokenizer.__class__.__name__}.pad_token_id` must be a valid non-negative integer index value, "
+            f"but now `{self.tokenizer.pad_token_id}`"
+        )
+        # prepare the preference data
+        prompt = [torch.LongTensor(instance["prompt"]) for instance in instances]
+        prompt_zeros = [torch.zeros_like(t) for t in prompt]
+        completion = [torch.LongTensor(instance["completion"]) for instance in instances]
+        completion_ones = [torch.ones_like(t) for t in completion]
+        label = [torch.tensor(instance["label"], dtype=torch.bool) for instance in instances]
+        input_ids = [torch.cat([prompt[i], completion[i]], dim=-1) for i in range(len(instances))]
+        loss_mask = [torch.cat([prompt_zeros[i], completion_ones[i]], dim=-1) for i in range(len(instances))]
+        # right padding
+        input_ids = torch.nn.utils.rnn.pad_sequence(
+            sequences=input_ids,
+            batch_first=True,
+            padding_value=self.tokenizer.pad_token_id,
+        )  # (bsz, max_len)
+        loss_mask = torch.nn.utils.rnn.pad_sequence(
+            sequences=loss_mask, batch_first=True, padding_value=0
+        )  # (bsz, max_len)
+        to_pad = self.max_length - input_ids.size(1)
+        input_ids = F.pad(input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
+        loss_mask = F.pad(loss_mask, (0, to_pad), value=0)
+        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)  # `torch.BoolTensor`, (bsz, max_len)
+
+        # prepare kt data
+        kl_completion = completion[::-1]  # y'
+        kl_completion_ones = [torch.ones_like(t) for t in kl_completion]
+        kl_input_ids = [torch.cat([prompt[i], kl_completion[i]], dim=-1) for i in range(len(instances))]
+        kl_loss_mask = [torch.cat([prompt_zeros[i], kl_completion_ones[i]], dim=-1) for i in range(len(instances))]
+        # right padding
+        kl_input_ids = torch.nn.utils.rnn.pad_sequence(
+            sequences=kl_input_ids,
+            batch_first=True,
+            padding_value=self.tokenizer.pad_token_id,
+        )  # (bsz, max_len)
+        kl_loss_mask = torch.nn.utils.rnn.pad_sequence(
+            sequences=kl_loss_mask, batch_first=True, padding_value=0
+        )  # (bsz, max_len)
+        to_pad = self.max_length - kl_input_ids.size(1)
+        kl_input_ids = F.pad(kl_input_ids, (0, to_pad), value=self.tokenizer.pad_token_id)
+        kl_loss_mask = F.pad(kl_loss_mask, (0, to_pad), value=0)
+        kl_attention_mask = kl_input_ids.ne(self.tokenizer.pad_token_id)  # `torch.BoolTensor`, (bsz, max_len)
+        data_dict = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "loss_mask": loss_mask,
+            "label": torch.stack(label),
+            "kl_input_ids": kl_input_ids,
+            "kl_attention_mask": kl_attention_mask,
+            "kl_loss_mask": kl_loss_mask,
+        }
+        return data_dict
+
+
 class StatefulDistributedSampler(DistributedSampler):
     def __init__(
         self,
diff --git a/applications/ColossalChat/coati/dataset/tokenization_utils.py b/applications/ColossalChat/coati/dataset/tokenization_utils.py
index 34828cbafcf0..9eb2eba87bf2 100755
--- a/applications/ColossalChat/coati/dataset/tokenization_utils.py
+++ b/applications/ColossalChat/coati/dataset/tokenization_utils.py
@@ -23,11 +23,10 @@
 DSType = Union[Dataset, ConcatDataset, dataset_dict.Dataset]
 
 
-def supervised_tokenize_sft(
+def tokenize_sft(
     data_point: Dict[str, str],
     tokenizer: PreTrainedTokenizer,
     conversation_template: Conversation = None,
-    ignore_index: int = None,
     max_length: int = 4096,
 ) -> Dict[str, Union[int, str, List[int]]]:
     """
@@ -39,51 +38,37 @@ def supervised_tokenize_sft(
 
     Args:
         data_point: the data point of the following format
-            {"messages": [{"from": "human", "content": "xxx"}, {"from": "assistant", "content": "xxx"}]}
+            {"messages": [{"from": "user", "content": "xxx"}, {"from": "assistant", "content": "xxx"}]}
         tokenizer: the tokenizer whose
         conversation_template: the conversation template to apply
         ignore_index: the ignore index when calculate loss during training
         max_length: the maximum context length
     """
 
-    if ignore_index is None:
-        ignore_index = IGNORE_INDEX
+    ignore_index = IGNORE_INDEX
 
     messages = data_point["messages"]
     template = deepcopy(conversation_template)
     template.messages = []
-
-    for mess in messages:
-        from_str = mess["from"]
-        if from_str.lower() == "human":
-            from_str = "user"
-        elif from_str.lower() == "assistant":
-            from_str = "assistant"
-        else:
-            raise ValueError(f"Unsupported role {from_str.lower()}")
-
-        template.append_message(from_str, mess["content"])
+    for idx, mess in enumerate(messages):
+        if mess["from"] != template.roles[idx % 2]:
+            raise ValueError(
+                f"Message should iterate between user and assistant and starts with a \
+                             line from the user. Got the following data:\n{messages}"
+            )
+        template.append_message(mess["from"], mess["content"])
 
     if len(template.messages) % 2 != 0:
+        # Force to end with assistant response
         template.messages = template.messages[0:-1]
 
-    # `target_turn_index` is the number of turns which exceeds `max_length - 1` for the first time.
-    turns = [i for i in range(1, len(messages) // 2 + 1)]
-
-    lo, hi = 0, len(turns)
-    while lo < hi:
-        mid = (lo + hi) // 2
-        if max_length - 1 < len(
-            tokenizer([template.get_prompt(2 * turns[mid] - 1)], add_special_tokens=False)["input_ids"][0]
-        ):
-            hi = mid
-        else:
-            lo = mid + 1
-    target_turn_index = lo
-
-    # The tokenized length for first turn already exceeds `max_length - 1`.
-    if target_turn_index - 1 < 0:
-        warnings.warn("The tokenized length for first turn already exceeds `max_length - 1`.")
+    # tokenize and calculate masked labels -100 for positions corresponding to non-assistant lines
+    prompt = template.get_prompt()
+    chunks, require_loss = split_templated_prompt_into_chunks(
+        template.messages, prompt, conversation_template.end_of_assistant
+    )
+    tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss, max_length=max_length)
+    if tokenized is None:
         return dict(
             input_ids=None,
             labels=None,
@@ -93,44 +78,18 @@ def supervised_tokenize_sft(
             seq_category=None,
         )
 
-    target_turn = turns[target_turn_index - 1]
-    prompt = template.get_prompt(2 * target_turn)
-    chunks, require_loss = split_templated_prompt_into_chunks(
-        template.messages[: 2 * target_turn], prompt, conversation_template.end_of_assistant
-    )
-    tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
-
     labels = [ignore_index] * len(tokenized)
     for start, end in zip(starts, ends):
-        if end == len(tokenized):
-            tokenized = tokenized + [tokenizer.eos_token_id]
-            labels = labels + [ignore_index]
         labels[start:end] = tokenized[start:end]
 
-    # truncate the sequence at the last token that requires loss calculation
-    to_truncate_len = 0
-    for i in range(len(tokenized) - 1, -1, -1):
-        if labels[i] == ignore_index:
-            to_truncate_len += 1
-        else:
-            break
-    tokenized = tokenized[: len(tokenized) - to_truncate_len]
-    labels = labels[: len(labels) - to_truncate_len]
-
     if tokenizer.bos_token_id is not None:
+        # Force to add bos token at the beginning of the tokenized sequence if the input ids doesn;t starts with bos
         if tokenized[0] != tokenizer.bos_token_id:
+            # Some chat templates already include bos token
             tokenized = [tokenizer.bos_token_id] + tokenized
-            labels = [ignore_index] + labels
-
-    if tokenizer.eos_token_id is not None:
-        # Force to add eos token at the end of the tokenized sequence
-        if tokenized[-1] != tokenizer.eos_token_id:
-            tokenized = tokenized + [tokenizer.eos_token_id]
-            labels = labels + [tokenizer.eos_token_id]
-        else:
-            labels[-1] = tokenizer.eos_token_id
+            labels = [-100] + labels
 
-    # For some model without bos/eos may raise the following errors
+    # log decoded inputs and labels for debugging
     inputs_decode = tokenizer.decode(tokenized)
     start = 0
     end = 0
@@ -167,11 +126,10 @@ def supervised_tokenize_sft(
     )
 
 
-def tokenize_prompt_dataset(
+def tokenize_prompt(
     data_point: Dict[str, str],
     tokenizer: PreTrainedTokenizer,
     conversation_template: Conversation = None,
-    ignore_index: int = None,
     max_length: int = 4096,
 ) -> Dict[str, Union[int, str, List[int]]]:
     """
@@ -179,48 +137,39 @@ def tokenize_prompt_dataset(
         "Something here can be system message[user_line_start]User line[User line end][Assistant line start]Assistant line[Assistant line end]...[Assistant line start]"
     Args:
         data_point: the data point of the following format
-            {"messages": [{"from": "human", "content": "xxx"}, {"from": "assistant", "content": "xxx"}]}
+            {"messages": [{"from": "user", "content": "xxx"}, {"from": "assistant", "content": "xxx"}]}
         tokenizer: the tokenizer whose
         conversation_template: the conversation template to apply
         ignore_index: the ignore index when calculate loss during training
         max_length: the maximum context length
     """
-    if ignore_index is None:
-        ignore_index = IGNORE_INDEX
 
     messages = data_point["messages"]
     template = deepcopy(conversation_template)
     template.messages = []
 
-    for mess in messages:
-        from_str = mess["from"]
-        if from_str.lower() == "human":
-            from_str = "user"
-        elif from_str.lower() == "assistant":
-            from_str = "assistant"
-        else:
-            raise ValueError(f"Unsupported role {from_str.lower()}")
-
-        template.append_message(from_str, mess["content"])
+    for idx, mess in enumerate(messages):
+        if mess["from"] != template.roles[idx % 2]:
+            raise ValueError(
+                f"Message should iterate between user and assistant and starts with a \
+                             line from the user. Got the following data:\n{messages}"
+            )
+        template.append_message(mess["from"], mess["content"])
 
     # `target_turn_index` is the number of turns which exceeds `max_length - 1` for the first time.
-    target_turn = len(template.messages)
-    if target_turn % 2 != 1:
+    if len(template.messages) % 2 != 1:
         # exclude the answer if provided. keep only the prompt
-        target_turn = target_turn - 1
+        template.messages = template.messages[:-1]
 
     # Prepare data
-    prompt = template.get_prompt(target_turn, add_generation_prompt=True)
-    chunks, require_loss = split_templated_prompt_into_chunks(
-        template.messages[:target_turn], prompt, conversation_template.end_of_assistant
-    )
-    tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
+    prompt = template.get_prompt(length=len(template.messages) - 1, add_generation_prompt=True)
+    tokenized = tokenizer([prompt], add_special_tokens=False)["input_ids"][0]
+
     if tokenizer.bos_token_id is not None:
         if tokenized[0] != tokenizer.bos_token_id:
             tokenized = [tokenizer.bos_token_id] + tokenized
 
-    # Skip overlength data
-    if max_length - 1 < len(tokenized):
+    if len(tokenized) > max_length:
         return dict(
             input_ids=None,
             inputs_decode=None,
@@ -231,47 +180,32 @@ def tokenize_prompt_dataset(
     # `inputs_decode` can be used to check whether the tokenization method is true.
     return dict(
         input_ids=tokenized,
-        inputs_decode=tokenizer.decode(tokenized),
+        inputs_decode=prompt,
         seq_length=len(tokenized),
         seq_category=data_point["category"] if "category" in data_point else "None",
     )
 
 
-def apply_rlhf_data_format(
-    template: Conversation, tokenizer: Any, context_len: int, mask_out_target_assistant_line_end=False
-):
+def apply_rlhf_data_format(template: Conversation, tokenizer: Any):
     target_turn = int(len(template.messages) / 2)
     prompt = template.get_prompt(target_turn * 2)
     chunks, require_loss = split_templated_prompt_into_chunks(
         template.messages[: 2 * target_turn], prompt, template.end_of_assistant
     )
-    tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss)
-    loss_mask = [0] * len(tokenized)
-    mask_token = tokenizer.eos_token_id or tokenizer.pad_token_id
-    if mask_token is None:
-        mask_token = 1  # If the tokenizer doesn't have eos_token or pad_token: Qwen
+    # no truncation applied
+    tokenized, starts, ends = tokenize_and_concatenate(tokenizer, chunks, require_loss, max_length=None)
 
+    loss_mask = [0] * len(tokenized)
     label_decode = []
-    for start, end in zip(starts[-1:], ends[-1:]):
-        # only the last round (chosen/rejected) counts
-        if end == len(tokenized):
-            tokenized = tokenized + [tokenizer.eos_token_id]
-            loss_mask = loss_mask + [1]
-        loss_mask[start:end] = [1] * len(loss_mask[start:end])
-        label_decode.append(tokenizer.decode(tokenized[start:end], skip_special_tokens=False))
+    # only the last round (chosen/rejected) is used to calculate loss
+    for i in range(starts[-1], ends[-1]):
+        loss_mask[i] = 1
+    label_decode.append(tokenizer.decode(tokenized[starts[-1] : ends[-1]], skip_special_tokens=False))
     if tokenizer.bos_token_id is not None:
         if tokenized[0] != tokenizer.bos_token_id:
             tokenized = [tokenizer.bos_token_id] + tokenized
             loss_mask = [0] + loss_mask
 
-    if tokenizer.eos_token_id is not None:
-        # Force to add eos token at the end of the tokenized sequence
-        if tokenized[-1] != tokenizer.eos_token_id:
-            tokenized = tokenized + [tokenizer.eos_token_id]
-            loss_mask = loss_mask + [1]
-        else:
-            loss_mask[-1] = 1
-
     return {"input_ids": tokenized, "loss_mask": loss_mask, "label_decode": label_decode}
 
 
@@ -279,39 +213,29 @@ def tokenize_rlhf(
     data_point: Dict[str, str],
     tokenizer: PreTrainedTokenizer,
     conversation_template: Conversation = None,
-    ignore_index: int = None,
     max_length: int = 4096,
 ) -> Dict[str, Union[int, str, List[int]]]:
     """
     A tokenization function to tokenize an original pretraining data point as following:
-        {"context": [{"from": "human", "content": "xxx"}, {"from": "assistant", "content": "xxx"}],
+        {"context": [{"from": "user", "content": "xxx"}, {"from": "assistant", "content": "xxx"}],
         "chosen": {"from": "assistant", "content": "xxx"}, "rejected": {"from": "assistant", "content": "xxx"}}
     """
-    if ignore_index is None:
-        ignore_index = IGNORE_INDEX
 
     context = data_point["context"]
     template = deepcopy(conversation_template)
     template.clear()
 
-    for mess in context:
-        from_str = mess["from"]
-        if from_str.lower() == "human":
-            from_str = "user"
-        elif from_str.lower() == "assistant":
-            from_str = "assistant"
-        else:
-            raise ValueError(f"Unsupported role {from_str.lower()}")
-
-        if len(template.messages) > 0 and from_str == template.messages[-1]["role"]:
-            # Concate adjacent message from the same role
-            template.messages[-1]["content"] = str(template.messages[-1]["content"] + " " + mess["content"])
-        else:
-            template.append_message(from_str, mess["content"])
+    for idx, mess in enumerate(context):
+        if mess["from"] != template.roles[idx % 2]:
+            raise ValueError(
+                f"Message should iterate between user and assistant and starts with a \
+                             line from the user. Got the following data:\n{context}"
+            )
+        template.append_message(mess["from"], mess["content"])
 
     if len(template.messages) % 2 != 1:
         warnings.warn(
-            "Please make sure leading context starts and ends with a line from human\nLeading context: "
+            "Please make sure leading context starts and ends with a line from user\nLeading context: "
             + str(template.messages)
         )
         return dict(
@@ -322,31 +246,27 @@ def tokenize_rlhf(
             rejected_loss_mask=None,
             rejected_label_decode=None,
         )
-    round_of_context = int((len(template.messages) - 1) / 2)
 
-    assert context[-1]["from"].lower() == "human", "The last message in context should be from human."
+    assert context[-1]["from"].lower() == template.roles[0], "The last message in context should be from user."
     chosen = deepcopy(template)
     rejected = deepcopy(template)
+    chosen_continuation = data_point["chosen"]
+    rejected_continuation = data_point["rejected"]
+    for round in range(len(chosen_continuation)):
+        if chosen_continuation[round]["from"] != template.roles[(round + 1) % 2]:
+            raise ValueError(
+                f"Message should iterate between user and assistant and starts with a \
+                             line from the user. Got the following data:\n{chosen_continuation}"
+            )
+        chosen.append_message(chosen_continuation[round]["from"], chosen_continuation[round]["content"])
 
-    for round in range(len(data_point["chosen"])):
-        from_str = data_point["chosen"][round]["from"]
-        if from_str.lower() == "human":
-            from_str = "user"
-        elif from_str.lower() == "assistant":
-            from_str = "assistant"
-        else:
-            raise ValueError(f"Unsupported role {from_str.lower()}")
-        chosen.append_message(from_str, data_point["chosen"][round]["content"])
-
-    for round in range(len(data_point["rejected"])):
-        from_str = data_point["rejected"][round]["from"]
-        if from_str.lower() == "human":
-            from_str = "user"
-        elif from_str.lower() == "assistant":
-            from_str = "assistant"
-        else:
-            raise ValueError(f"Unsupported role {from_str.lower()}")
-        rejected.append_message(from_str, data_point["rejected"][round]["content"])
+    for round in range(len(rejected_continuation)):
+        if rejected_continuation[round]["from"] != template.roles[(round + 1) % 2]:
+            raise ValueError(
+                f"Message should iterate between user and assistant and starts with a \
+                             line from the user. Got the following data:\n{rejected_continuation}"
+            )
+        rejected.append_message(rejected_continuation[round]["from"], rejected_continuation[round]["content"])
 
     (
         chosen_input_ids,
@@ -356,48 +276,32 @@ def tokenize_rlhf(
         rejected_loss_mask,
         rejected_label_decode,
     ) = (None, None, None, None, None, None)
-    if (
-        len(tokenizer([chosen.get_prompt(len(chosen.messages))], add_special_tokens=False)["input_ids"][0])
-        <= max_length - 1
-        and len(tokenizer([rejected.get_prompt(len(rejected.messages))], add_special_tokens=False)["input_ids"][0])
-        <= max_length - 1
-    ):
-        chosen_data_packed = apply_rlhf_data_format(chosen, tokenizer, round_of_context)
-        (chosen_input_ids, chosen_loss_mask, chosen_label_decode) = (
-            chosen_data_packed["input_ids"],
-            chosen_data_packed["loss_mask"],
-            chosen_data_packed["label_decode"],
-        )
 
-        rejected_data_packed = apply_rlhf_data_format(
-            rejected, tokenizer, round_of_context, mask_out_target_assistant_line_end=True
-        )
-        (rejected_input_ids, rejected_loss_mask, rejected_label_decode) = (
-            rejected_data_packed["input_ids"],
-            rejected_data_packed["loss_mask"],
-            rejected_data_packed["label_decode"],
-        )
+    chosen_data_packed = apply_rlhf_data_format(chosen, tokenizer)
+    (chosen_input_ids, chosen_loss_mask, chosen_label_decode) = (
+        chosen_data_packed["input_ids"],
+        chosen_data_packed["loss_mask"],
+        chosen_data_packed["label_decode"],
+    )
 
-        # Check if loss mask is all 0s (no loss), this may happen when the tokenized length is too long
-        if chosen_loss_mask.count(0) == len(chosen_loss_mask) or rejected_loss_mask.count(0) == len(rejected_loss_mask):
-            return dict(
-                chosen_input_ids=None,
-                chosen_loss_mask=None,
-                chosen_label_decode=None,
-                rejected_input_ids=None,
-                rejected_loss_mask=None,
-                rejected_label_decode=None,
-            )
+    rejected_data_packed = apply_rlhf_data_format(rejected, tokenizer)
+    (rejected_input_ids, rejected_loss_mask, rejected_label_decode) = (
+        rejected_data_packed["input_ids"],
+        rejected_data_packed["loss_mask"],
+        rejected_data_packed["label_decode"],
+    )
 
-        return {
-            "chosen_input_ids": chosen_input_ids,
-            "chosen_loss_mask": chosen_loss_mask,
-            "chosen_label_decode": chosen_label_decode,
-            "rejected_input_ids": rejected_input_ids,
-            "rejected_loss_mask": rejected_loss_mask,
-            "rejected_label_decode": rejected_label_decode,
-        }
-    else:
+    if len(chosen_input_ids) > max_length or len(rejected_input_ids) > max_length:
+        return dict(
+            chosen_input_ids=None,
+            chosen_loss_mask=None,
+            chosen_label_decode=None,
+            rejected_input_ids=None,
+            rejected_loss_mask=None,
+            rejected_label_decode=None,
+        )
+    # Check if loss mask is all 0s (no loss), this may happen when the tokenized length is too long
+    if chosen_loss_mask.count(1) == 0 or rejected_loss_mask.count(1) == 0:
         return dict(
             chosen_input_ids=None,
             chosen_loss_mask=None,
@@ -406,3 +310,71 @@ def tokenize_rlhf(
             rejected_loss_mask=None,
             rejected_label_decode=None,
         )
+
+    return {
+        "chosen_input_ids": chosen_input_ids,
+        "chosen_loss_mask": chosen_loss_mask,
+        "chosen_label_decode": chosen_label_decode,
+        "rejected_input_ids": rejected_input_ids,
+        "rejected_loss_mask": rejected_loss_mask,
+        "rejected_label_decode": rejected_label_decode,
+    }
+
+
+def tokenize_kto(
+    data_point: Dict[str, str],
+    tokenizer: PreTrainedTokenizer,
+    conversation_template: Conversation = None,
+    max_length: int = 4096,
+) -> Dict[str, Union[int, str, List[int]]]:
+    """
+    Tokenize a dataset for KTO training
+    The raw input data is conversation that have the following format
+    {
+        "prompt": [{"from": "user", "content": "xxx"}...],
+        "completion": {"from": "assistant", "content": "xxx"},
+        "label": true/false
+    }
+    It returns three fields
+    The context, which contain the query and the assistant start,
+    the completion, which only contains the assistance's answer,
+    and a binary label, which indicates if the sample is prefered or not
+    """
+    prompt = data_point["prompt"]
+    completion = data_point["completion"]
+    template = deepcopy(conversation_template)
+    template.clear()
+
+    if prompt[0].get("from", None) != "user":
+        raise ValueError("conversation should start with user")
+    if completion.get("from", None) != "assistant":
+        raise ValueError("conversation should end with assistant")
+
+    for mess in prompt:
+        if mess.get("from", None) == "user":
+            template.append_message("user", mess["content"])
+        elif mess.get("from", None) == "assistant":
+            template.append_message("assistant", mess["content"])
+        else:
+            raise ValueError(f"Unsupported role {mess.get('from', None)}")
+    generation_prompt = template.get_prompt(len(prompt), add_generation_prompt=True)
+    template.append_message("assistant", completion["content"])
+    full_prompt = template.get_prompt(len(prompt) + 1, add_generation_prompt=False)
+    tokenized_full_prompt = tokenizer(full_prompt, add_special_tokens=False)["input_ids"]
+    if len(tokenized_full_prompt) + 1 > max_length:
+        return dict(prompt=None, completion=None, label=None, input_id_decode=None, completion_decode=None)
+    tokenized_generation_prompt = tokenizer(generation_prompt, add_special_tokens=False)["input_ids"]
+    tokenized_completion = tokenized_full_prompt[len(tokenized_generation_prompt) :]
+    tokenized_completion = deepcopy(tokenized_completion)
+    if tokenizer.bos_token_id is not None and tokenized_generation_prompt[0] != tokenizer.bos_token_id:
+        tokenized_generation_prompt = [tokenizer.bos_token_id] + tokenized_generation_prompt
+    decoded_full_prompt = tokenizer.decode(tokenized_full_prompt, skip_special_tokens=False)
+    decoded_completion = tokenizer.decode(tokenized_completion, skip_special_tokens=False)
+
+    return {
+        "prompt": tokenized_generation_prompt,
+        "completion": tokenized_completion,
+        "label": data_point["label"],
+        "input_id_decode": decoded_full_prompt,
+        "completion_decode": decoded_completion,
+    }
diff --git a/applications/ColossalChat/coati/dataset/utils.py b/applications/ColossalChat/coati/dataset/utils.py
index f41a4d7724da..42c3191db3a5 100755
--- a/applications/ColossalChat/coati/dataset/utils.py
+++ b/applications/ColossalChat/coati/dataset/utils.py
@@ -88,7 +88,13 @@ def find_first_occurrence_subsequence(seq: torch.Tensor, subseq: torch.Tensor, s
     return -1
 
 
-def tokenize_and_concatenate(tokenizer: PreTrainedTokenizer, text: List[str], require_loss: List[bool]):
+def tokenize_and_concatenate(
+    tokenizer: PreTrainedTokenizer,
+    text: List[str],
+    require_loss: List[bool],
+    max_length: int,
+    discard_non_loss_tokens_at_tail: bool = True,
+):
     """
     Tokenizes a list of texts using the provided tokenizer and concatenates the tokenized outputs.
 
@@ -96,6 +102,13 @@ def tokenize_and_concatenate(tokenizer: PreTrainedTokenizer, text: List[str], re
         tokenizer (PreTrainedTokenizer): The tokenizer to use for tokenization.
         text (List[str]): The list of texts to tokenize.
         require_loss (List[bool]): A list of boolean values indicating whether each text requires loss calculation.
+        max_length: used to truncate the input ids
+        discard_non_loss_tokens_at_tail: whether to discard the non-loss tokens at the tail
+
+    if the first round has already exeeded max length
+    - if the user query already exeeded max length, discard the sample
+    - if only the first assistant response exeeded max length, truncate the response to fit the max length
+    else keep the first several complete rounds of the conversations until max length is reached
 
     Returns:
         Tuple[List[int], List[int], List[int]]: A tuple containing the concatenated tokenized input ids,
@@ -106,10 +119,18 @@ def tokenize_and_concatenate(tokenizer: PreTrainedTokenizer, text: List[str], re
     loss_ends = []
     for s, r in zip(text, require_loss):
         tokenized = tokenizer(s, add_special_tokens=False)["input_ids"]
-        if r:
-            loss_starts.append(len(input_ids))
-            loss_ends.append(len(input_ids) + len(tokenized))
-        input_ids.extend(tokenized)
+        if not max_length or len(input_ids) + len(tokenized) <= max_length or len(loss_ends) == 0:
+            if r:
+                loss_starts.append(len(input_ids))
+                loss_ends.append(len(input_ids) + len(tokenized))
+            input_ids.extend(tokenized)
+    if max_length and loss_starts[0] >= max_length:
+        return None, None, None
+    if discard_non_loss_tokens_at_tail:
+        input_ids = input_ids[: loss_ends[-1]]
+    if max_length:
+        input_ids = input_ids[:max_length]
+        loss_ends[-1] = min(max_length, loss_ends[-1])
     return input_ids, loss_starts, loss_ends
 
 
@@ -125,6 +146,12 @@ def split_templated_prompt_into_chunks(messages: List[Dict[str, str]], prompt: s
             content_length = (
                 prompt.find(end_of_assistant, first_occur + content_length) + len(end_of_assistant) - first_occur
             )
+        # if the tokenized content start with a leading space, we want to keep it in loss calculation
+        # e.g., Assistant: I am saying...
+        # if the tokenized content doesn't start with a leading space, we only need to keep the content in loss calculation
+        # e.g.,
+        # Assistant:   # '\n' as line breaker
+        # I am saying...
         if prompt[first_occur - 1] != " ":
             chunks.append(prompt[start_idx:first_occur])
             chunks.append(prompt[first_occur : first_occur + content_length])
diff --git a/applications/ColossalChat/coati/models/__init__.py b/applications/ColossalChat/coati/models/__init__.py
index 14073207f150..fba0949e3fb8 100755
--- a/applications/ColossalChat/coati/models/__init__.py
+++ b/applications/ColossalChat/coati/models/__init__.py
@@ -1,8 +1,8 @@
 from .base import BaseModel
 from .critic import Critic
 from .generation import generate, generate_streaming, prepare_inputs_fn, update_model_kwargs_fn
-from .lora import convert_to_lora_module
-from .loss import DpoLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
+from .lora import LoraConfig, convert_to_lora_module, lora_manager
+from .loss import DpoLoss, KTOLoss, LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
 from .reward_model import RewardModel
 from .utils import disable_dropout
 
@@ -14,9 +14,11 @@
     "ValueLoss",
     "LogSigLoss",
     "LogExpLoss",
+    "LoraConfig",
+    "lora_manager",
     "convert_to_lora_module",
     "DpoLoss",
-    "generate",
+    "KTOLoss" "generate",
     "generate_streaming",
     "disable_dropout",
     "update_model_kwargs_fn",
diff --git a/applications/ColossalChat/coati/models/base.py b/applications/ColossalChat/coati/models/base.py
index fcea9414b430..cfdffdf289bd 100755
--- a/applications/ColossalChat/coati/models/base.py
+++ b/applications/ColossalChat/coati/models/base.py
@@ -42,7 +42,6 @@ def __init__(self, pretrained: str = None, config: Optional[PretrainedConfig] =
         out = self.model(dummy_input)
         self.last_hidden_state_size = out.last_hidden_state.shape[-1]
         self.model = self.model.cpu()
-        # print("self.last_hidden_state_size: ",self.last_hidden_state_size)
 
     def resize_token_embeddings(self, *args, **kwargs):
         """
diff --git a/applications/ColossalChat/coati/models/lora.py b/applications/ColossalChat/coati/models/lora.py
index 9553b00ff2a8..aa5f6ecf8608 100755
--- a/applications/ColossalChat/coati/models/lora.py
+++ b/applications/ColossalChat/coati/models/lora.py
@@ -5,10 +5,11 @@
 import dataclasses
 import math
 import warnings
-from typing import Optional
+from typing import List, Optional, Union
 
 import loralib as lora
 import torch
+import torch.distributed as dist
 import torch.nn as nn
 import torch.nn.functional as F
 
@@ -18,148 +19,349 @@
 
 
 @dataclasses.dataclass
-class LoRAManager:
-    merge_weights: bool = False
+class LoraManager:
+    able_to_merge: bool = True
 
 
-LORA_MANAGER = LoRAManager()
+lora_manager = LoraManager()
 
 
-class LoraLinear(lora.LoRALayer, nn.Module):
+@dataclasses.dataclass
+class LoraConfig:
+    r: int = 0
+    lora_alpha: int = 32
+    linear_lora_dropout: float = 0.1
+    embedding_lora_dropout: float = 0.0
+    lora_train_bias: str = "none"
+    lora_initialization_method: str = "kaiming_uniform"
+    target_modules: List = None
+
+    @classmethod
+    def from_file(cls, config_file: str):
+        import json
+
+        with open(config_file, "r") as f:
+            config = json.load(f)
+        return cls(**config)
+
+
+class LoraBase(lora.LoRALayer, nn.Module):
+    def __init__(
+        self,
+        r: int = 0,
+        lora_alpha: int = 32,
+        lora_dropout: float = 0.1,
+        lora_initialization_method: str = "kaiming_uniform",
+    ):
+        nn.Module.__init__(self)
+        lora.LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=False)
+        self.r = r
+        self.lora_alpha = lora_alpha
+        self.lora_dropout = nn.Dropout(lora_dropout)
+        self.merged = False
+        self.lora_initialization_method = lora_initialization_method
+        self.weight = None
+        self.bias = None
+        self.lora_A = None
+        self.lora_B = None
+
+    def reset_parameters(self):
+        if hasattr(self, "lora_A"):
+            if self.lora_initialization_method == "kaiming_uniform" or self.weight.size() != (
+                self.out_features,
+                self.in_features,
+            ):
+                # Initialize A with the default values for nn.Linear and set B to zero.
+                nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
+                nn.init.zeros_(self.lora_B)
+            elif self.lora_initialization_method == "PiSSA":
+                # PiSSA method in this paper: https://arxiv.org/abs/2404.02948
+                # Assume the SVD of the original weights is W = USV^T
+                # Initialize a frozen weight to U[:,r:]S[r:,r:]V^T[:,r:] to store less significent part of W
+                # Only A, B are trainable, which are initialized to S[r:,:r]^0.5V^T[:,:r] and U[:,:r]S[r:,:r] respectively
+                # self.scaling = 1.
+                # SVD
+                U, S, Vh = torch.svd_lowrank(
+                    self.weight.to(torch.float32).data, self.r, niter=4
+                )  # U: [out_features, in_features], S: [in_features], V: [in_features, in_features]
+                # weight_backup = self.weight.clone()
+
+                # Initialize A, B
+                S = S / self.scaling
+                self.lora_B.data = (U @ torch.diag(torch.sqrt(S))).to(torch.float32).contiguous()
+                self.lora_A.data = (torch.diag(torch.sqrt(S)) @ Vh.T).to(torch.float32).contiguous()
+                # Initialize weight
+                # To reduce floating point error, we use residual instead of directly using U[:, :self.r] @ S[:self.r] @ Vh[:self.r, :]
+                self.weight.data = (
+                    ((self.weight - self.scaling * self.lora_B @ self.lora_A)).contiguous().to(self.weight.dtype)
+                )
+                self.lora_A.requires_grad = True
+                self.lora_B.requires_grad = True
+            else:
+                raise ValueError(f"Unknown LoRA initialization method {self.lora_initialization_method}")
+
+    def train(self, mode: bool = True):
+        """
+        This function runs when model.train() is invoked. It is used to prepare the linear layer for training
+        """
+
+        self.training = mode
+        if mode and self.merged:
+            warnings.warn("Invoke module.train() would unmerge LoRA weights.")
+            raise NotImplementedError("LoRA unmerge is not tested.")
+        elif not mode and not self.merged and lora_manager.able_to_merge:
+            warnings.warn("Invoke module.eval() would merge LoRA weights.")
+            # Merge the weights and mark it
+            if self.r > 0:
+                self.weight.data += self.lora_B @ self.lora_A * self.scaling
+                delattr(self, "lora_A")
+                delattr(self, "lora_B")
+            self.merged = True
+
+        return self
+
+
+class LoraLinear(LoraBase):
     """Replace in-place ops to out-of-place ops to fit gemini. Convert a torch.nn.Linear to LoraLinear."""
 
     def __init__(
         self,
         weight: nn.Parameter,
-        bias: Optional[nn.Parameter],
+        bias: Union[nn.Parameter, bool],
         r: int = 0,
-        lora_alpha: int = 1,
+        lora_alpha: int = 32,
         lora_dropout: float = 0.0,
-        # Set this to True if the layer to replace stores weight like (fan_in, fan_out)
-        fan_in_fan_out: bool = False,
+        lora_initialization_method: str = "kaiming_uniform",
     ):
-        nn.Module.__init__(self)
-        lora.LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=False)
+        super().__init__(
+            r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, lora_initialization_method=lora_initialization_method
+        )
         self.weight = weight
         self.bias = bias
+        if bias is True:
+            self.bias = nn.Parameter(torch.zeros(weight.shape[0]))
+        if bias is not None:
+            self.bias.requires_grad = True
 
         out_features, in_features = weight.shape
         self.in_features = in_features
         self.out_features = out_features
-
-        self.fan_in_fan_out = fan_in_fan_out
+        assert lora_initialization_method in ["kaiming_uniform", "PiSSA"]
+        self.lora_initialization_method = lora_initialization_method
         # Actual trainable parameters
         if r > 0:
-            self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)))
-            self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r)))
+            self.lora_A = nn.Parameter(torch.randn((r, in_features)))
+            self.lora_B = nn.Parameter(torch.randn((out_features, r)))
             self.scaling = self.lora_alpha / self.r
             # Freezing the pre-trained weight matrix
             self.weight.requires_grad = False
         self.reset_parameters()
-        if fan_in_fan_out:
-            self.weight.data = self.weight.data.T
 
-    def reset_parameters(self):
-        if hasattr(self, "lora_A"):
-            # Initialize A with the default values for nn.Linear and set B to zero.
-            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
-            nn.init.zeros_(self.lora_B)
+    def forward(self, x: torch.Tensor):
+        if self.r > 0 and not self.merged:
+            result = F.linear(x, self.weight, bias=self.bias)
+            result = result + (self.lora_dropout(x) @ self.lora_A.t() @ self.lora_B.t()) * self.scaling
+            return result
+        else:
+            return F.linear(x, self.weight, bias=self.bias)
+
+
+class LoraEmbedding(LoraBase):
+    """Replace in-place ops to out-of-place ops to fit gemini. Convert a torch.nn.Linear to LoraLinear."""
+
+    def __init__(
+        self,
+        weight: nn.Parameter,
+        r: int = 0,
+        lora_alpha: int = 32,
+        lora_dropout: float = 0.1,
+        num_embeddings: int = None,
+        embedding_dim: int = None,
+        padding_idx: Optional[int] = None,
+        max_norm: Optional[float] = None,
+        norm_type: float = 2.0,
+        scale_grad_by_freq: bool = False,
+        sparse: bool = False,
+        lora_initialization_method: str = "kaiming_uniform",
+    ):
+        super().__init__(
+            r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, lora_initialization_method=lora_initialization_method
+        )
+        self.padding_idx = padding_idx
+        self.max_norm = max_norm
+        self.norm_type = norm_type
+        self.scale_grad_by_freq = scale_grad_by_freq
+        self.sparse = sparse
+        self.num_embeddings = num_embeddings
+        self.embedding_dim = embedding_dim
+
+        self.weight = weight
+
+        in_features, out_features = num_embeddings, embedding_dim
+        self.in_features = in_features
+        self.out_features = out_features
+        assert lora_initialization_method in ["kaiming_uniform", "PiSSA"]
+        self.lora_initialization_method = lora_initialization_method
+
+        # Actual trainable parameters
+        if r > 0:
+            self.lora_A = nn.Parameter(torch.randn((r, in_features)))
+            self.lora_B = nn.Parameter(torch.randn((out_features, r)))
+            self.scaling = self.lora_alpha / self.r
+            # Freezing the pre-trained weight matrix
+            self.weight.requires_grad = False
+
+        # reset parameters
+        nn.init.zeros_(self.lora_A)
+        nn.init.normal_(self.lora_B)
+
+    def _embed(self, x: torch.Tensor, weight) -> torch.Tensor:
+        return F.embedding(
+            x,
+            weight,
+            padding_idx=self.padding_idx,
+            max_norm=self.max_norm,
+            norm_type=self.norm_type,
+            scale_grad_by_freq=self.scale_grad_by_freq,
+            sparse=self.sparse,
+        )
+
+    def forward(self, x: torch.Tensor):
+        base_embedding = self._embed(x, self.weight)
+        # base_embedding.requires_grad = True   # force the embedding layer to be trainable for gradient checkpointing
+        if self.r > 0 and not self.merged:
+            lora_A_embedding = self._embed(x, self.lora_A.t())
+            embedding = base_embedding + (lora_A_embedding @ self.lora_B.t()) * self.scaling
+            return embedding
+        else:
+            return base_embedding
 
     def train(self, mode: bool = True):
         """
         This function runs when model.train() is invoked. It is used to prepare the linear layer for training
         """
 
-        def T(w):
-            return w.T if self.fan_in_fan_out else w
-
         self.training = mode
-        if LORA_MANAGER.merge_weights:
-            if mode and self.merged:
-                warnings.warn("Invoke module.train() would unmerge LoRA weights.")
-                raise NotImplementedError("LoRA unmerge is not tested.")
-                # Make sure that the weights are not merged
-                if self.r > 0:
-                    if not hasattr(self, "lora_A") or not hasattr(self, "lora_B"):
-                        # FIXME(csric): temporary fix
-                        self.lora_A = nn.Parameter(self.weight.new_empty((self.r, self.in_features)))
-                        self.lora_B = nn.Parameter(self.weight.new_empty((self.out_features, self.r)))
-                        self.reset_parameters()
-                    else:
-                        self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
-                self.merged = False
-            elif not mode and not self.merged:
-                warnings.warn("Invoke module.eval() would merge LoRA weights.")
-                # Merge the weights and mark it
-                if self.r > 0:
-                    self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
-                    delattr(self, "lora_A")
-                    delattr(self, "lora_B")
-                self.merged = True
+        if mode and self.merged:
+            warnings.warn("Invoke module.train() would unmerge LoRA weights.")
+            raise NotImplementedError("LoRA unmerge is not tested.")
+        elif not mode and not self.merged and lora_manager.able_to_merge:
+            warnings.warn("Invoke module.eval() would merge LoRA weights.")
+            # Merge the weights and mark it
+            if self.r > 0:
+                self.weight.data += self.lora_A.t() @ self.lora_B.t() * self.scaling
+                delattr(self, "lora_A")
+                delattr(self, "lora_B")
+            self.merged = True
 
         return self
 
-    def forward(self, x: torch.Tensor):
-        def T(w):
-            return w.T if self.fan_in_fan_out else w
-
-        if self.r > 0 and not self.merged:
-            result = F.linear(x, T(self.weight), bias=self.bias)
-            if self.r > 0:
-                result = result + (self.lora_dropout(x) @ self.lora_A.t() @ self.lora_B.t()) * self.scaling
-            return result
-        else:
-            return F.linear(x, T(self.weight), bias=self.bias)
-
 
-def _lora_linear_wrapper(linear: nn.Linear, lora_rank: int) -> LoraLinear:
+def _lora_linear_wrapper(linear: nn.Linear, lora_config: LoraConfig) -> LoraLinear:
     """
     Wraps a linear layer with LoRA functionality.
 
     Args:
         linear (nn.Linear): The linear layer to be wrapped.
         lora_rank (int): The rank of the LoRA decomposition.
+        lora_train_bias (str): Whether to train the bias. Can be "none", "all", "lora".
+        lora_initialization_method (str): The initialization method for LoRA. Can be "kaiming_uniform" or "PiSSA".
 
     Returns:
         LoraLinear: The wrapped linear layer with LoRA functionality.
     """
     assert (
-        lora_rank <= linear.in_features
-    ), f"LoRA rank ({lora_rank}) must be less than or equal to in features ({linear.in_features})"
-    lora_linear = LoraLinear(linear.weight, linear.bias, r=lora_rank)
+        lora_config.r <= linear.in_features
+    ), f"LoRA rank ({lora_config.r}) must be less than or equal to in features ({linear.in_features})"
+    bias = None
+    if lora_config.lora_train_bias in ["all", "lora"]:
+        bias = linear.bias
+        if bias is None:
+            bias = True
+    lora_linear = LoraLinear(
+        linear.weight, bias, r=lora_config.r, lora_initialization_method=lora_config.lora_initialization_method
+    )
     return lora_linear
 
 
-def _convert_to_lora_recursively(module: nn.Module, lora_rank: int) -> None:
+def _convert_to_lora_recursively(module: nn.Module, parent_name: str, lora_config: LoraConfig) -> None:
     """
     Recursively converts the given module and its children to LoRA (Low-Rank Approximation) form.
 
     Args:
         module (nn.Module): The module to convert to LoRA form.
         lora_rank (int): The rank of the LoRA approximation.
+        lora_train_bias (str): Whether to train the bias. Can be "none", "all", "lora".
+        parent_name (str): The name of the parent module.
+        lora_initialization_method (str): The initialization method for LoRA. Can be "kaiming_uniform" or "PiSSA".
 
     Returns:
         None
     """
     for name, child in module.named_children():
         if isinstance(child, nn.Linear):
-            setattr(module, name, _lora_linear_wrapper(child, lora_rank))
+            if lora_config.target_modules is None or any(
+                [name in target_module for target_module in lora_config.target_modules]
+            ):
+                if dist.is_initialized() and dist.get_rank() == 0:
+                    logger.info(f"Converting {parent_name}.{name} to LoRA")
+                setattr(module, name, _lora_linear_wrapper(child, lora_config))
+        elif isinstance(child, nn.Embedding):
+            if lora_config.target_modules is None or any(
+                [name in target_module for target_module in lora_config.target_modules]
+            ):
+                if dist.is_initialized() and dist.get_rank() == 0:
+                    logger.info(f"Converting {parent_name}.{name} to LoRA")
+                setattr(
+                    module,
+                    name,
+                    LoraEmbedding(
+                        child.weight,
+                        r=lora_config.r,
+                        lora_alpha=lora_config.lora_alpha,
+                        lora_dropout=lora_config.embedding_lora_dropout,
+                        num_embeddings=child.num_embeddings,
+                        embedding_dim=child.embedding_dim,
+                        padding_idx=child.padding_idx,
+                        max_norm=child.max_norm,
+                        norm_type=child.norm_type,
+                        scale_grad_by_freq=child.scale_grad_by_freq,
+                        sparse=child.sparse,
+                        lora_initialization_method=lora_config.lora_initialization_method,
+                    ),
+                )
         else:
-            _convert_to_lora_recursively(child, lora_rank)
+            _convert_to_lora_recursively(child, f"{parent_name}.{name}", lora_config)
 
 
-def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: str = "none") -> nn.Module:
+def convert_to_lora_module(module: nn.Module, lora_config: LoraConfig) -> nn.Module:
     """Convert a torch.nn.Module to a LoRA module.
 
     Args:
         module (nn.Module): The module to convert.
         lora_rank (int): LoRA rank.
+        lora_train_bias (str): Whether to train the bias. Can be "none", "all", "lora".
+        lora_initialization_method (str): The initialization method for LoRA. Can be "kaiming_uniform" or "PiSSA".
 
     Returns:
         nn.Module: The converted module.
     """
-    if lora_rank <= 0:
+    if lora_config.r <= 0:
         return module
-    _convert_to_lora_recursively(module, lora_rank)
-    lora.mark_only_lora_as_trainable(module, lora_train_bias)
+    # make all parameter not trainable, if lora_train_bias is "all", set bias to trainable
+    total_parameter_size = 0
+    for name, p in module.named_parameters():
+        p.requires_grad = False
+        if "bias" in name and lora_config.lora_train_bias == "all":
+            p.requires_grad = True
+        total_parameter_size += p.numel()
+    _convert_to_lora_recursively(module, "", lora_config)
+    trainable_parameter_size = 0
+    for name, p in module.named_parameters():
+        if p.requires_grad == True:
+            trainable_parameter_size += p.numel()
+    if dist.is_initialized() and dist.get_rank() == 0:
+        logger.info(
+            f"Trainable parameter size: {trainable_parameter_size/1024/1024:.2f}M\nOriginal trainable parameter size: {total_parameter_size/1024/1024:.2f}M\nPercentage: {trainable_parameter_size/total_parameter_size*100:.2f}%"
+        )
     return module
diff --git a/applications/ColossalChat/coati/models/loss.py b/applications/ColossalChat/coati/models/loss.py
index e411dded148c..840cca074c39 100755
--- a/applications/ColossalChat/coati/models/loss.py
+++ b/applications/ColossalChat/coati/models/loss.py
@@ -5,6 +5,7 @@
 from typing import Optional, Tuple
 
 import torch
+import torch.distributed as dist
 import torch.nn as nn
 
 from .utils import masked_mean
@@ -89,11 +90,22 @@ class DpoLoss(nn.Module):
     """
     Dpo loss
     Details: https://arxiv.org/pdf/2305.18290.pdf
+
+    SimPO loss:
+    Details: https://arxiv.org/pdf/2405.14734.pdf
     """
 
-    def __init__(self, beta: float = 0.1):
+    def __init__(self, beta: float = 0.1, gamma: float = 0.0):
+        """
+        Args:
+            beta: The temperature parameter in the DPO paper.
+            gamma: The margin parameter in the SimPO paper.
+            length_normalization: Whether to normalize the loss by the length of chosen and rejected responses.
+                Refer to the length normalization in the SimPO paper
+        """
         super().__init__()
         self.beta = beta
+        self.gamma = gamma
 
     def forward(
         self,
@@ -104,7 +116,7 @@ def forward(
         chosen_mask: torch.Tensor,
         reject_mask: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """Compute the DPO loss for a batch of policy and reference model log probabilities.
+        """Compute the DPO/SimPO loss for a batch of policy and reference model log probabilities.
 
         # adapted from https://github.com/huggingface/trl/blob/main/trl/trainer/dpo_trainer.py#L328
 
@@ -113,6 +125,8 @@ def forward(
             logprob_actor_reject: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
             logprob_ref_chosen: Log probabilities of the reference model for the chosen responses. Shape: (batch_size,)
             logprob_ref_reject: Log probabilities of the reference model for the rejected responses. Shape: (batch_size,)
+            chosen_mask: Mask tensor indicating which responses were chosen. Shape: (batch_size,)
+            reject_mask: Mask tensor indicating which responses were rejected. Shape: (batch_size,)
 
         Returns:
             A tuple of three tensors: (losses, chosen_rewards, rejected_rewards).
@@ -127,13 +141,12 @@ def forward(
             if len(logprob_ref_chosen.shape) == 2:
                 ref_logratios = logprob_ref_chosen.sum(-1) - logprob_ref_reject.sum(-1)
             else:
-                ref_logratios = logprob_ref_chosen.squeeze() - logprob_ref_reject.squeeze()
+                ref_logratios = logprob_ref_chosen - logprob_ref_reject
         else:
             # If no reference model is provided
             ref_logratios = 0.0
-
         pi_logratios = logprob_actor_chosen.sum(-1) - logprob_actor_reject.sum(-1)
-        logits = pi_logratios - ref_logratios
+        logits = pi_logratios - ref_logratios - self.gamma / self.beta
         losses = -torch.nn.functional.logsigmoid(self.beta * logits)
 
         # Calculate rewards for logging
@@ -168,3 +181,93 @@ class LogExpLoss(nn.Module):
     def forward(self, chosen_reward: torch.Tensor, reject_reward: torch.Tensor) -> torch.Tensor:
         loss = torch.log(1 + torch.exp(reject_reward - chosen_reward)).mean()
         return loss
+
+
+class OddsRatioLoss(nn.Module):
+    """
+    Odds Ratio Loss in ORPO
+    Details: https://arxiv.org/pdf/2403.07691
+    """
+
+    def forward(
+        self,
+        chosen_logp: torch.Tensor,
+        reject_logp: torch.Tensor,
+        chosen_loss_mask: torch.Tensor,
+        reject_loss_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        chosen_logp = chosen_logp.to(dtype=torch.float32)
+        reject_logp = reject_logp.to(dtype=torch.float32)
+        chosen_odds = chosen_logp - torch.log(-torch.exp(chosen_logp) + 1.0001)
+        chosen_odds_masked = torch.sum(chosen_odds * chosen_loss_mask.float()) / torch.sum(chosen_loss_mask)
+        reject_odds = reject_logp - torch.log(-torch.exp(reject_logp) + 1.0001)
+        reject_odds_masked = torch.sum(reject_odds * reject_loss_mask.float()) / torch.sum(reject_loss_mask)
+        log_odds_ratio = chosen_odds_masked - reject_odds_masked
+        ratio = torch.log(torch.nn.functional.sigmoid(log_odds_ratio))
+        return ratio.to(dtype=torch.bfloat16), log_odds_ratio
+
+
+class KTOLoss(nn.Module):
+    def __init__(self, beta: float = 0.1, desirable_weight: float = 1.0, undesirable_weight: float = 1.0):
+        """
+        Args:
+            beta: The temperature parameter in the KTO paper.
+            desirable_weight: The weight for the desirable responses.
+            undesirable_weight: The weight for the undesirable
+        """
+        super().__init__()
+        self.beta = beta
+        self.desirable_weight = desirable_weight
+        self.undesirable_weight = undesirable_weight
+
+    def forward(
+        self,
+        chosen_logps: torch.Tensor,
+        rejected_logps: torch.Tensor,
+        kl_logps: torch.Tensor,
+        ref_chosen_logps: torch.Tensor,
+        ref_rejected_logps: torch.Tensor,
+        ref_kl_logps: torch.Tensor,
+    ):
+        """
+        Reference:
+            https://github.com/huggingface/trl/blob/a2adfb836a90d1e37b1253ab43dace05f1241e04/trl/trainer/kto_trainer.py#L585
+
+        Compute the KTO loss for a batch of policy and reference model log probabilities.
+        Args:
+            chosen_logps: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,)
+            rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,)
+            kl_logps: KL divergence of the policy model. Shape: (batch_size,)
+            ref_chosen_logps: Log probabilities of the reference model for the chosen responses. Shape: (batch_size,)
+            ref_rejected_logps: Log probabilities of the reference model for the rejected responses. Shape: (batch_size,)
+            ref_kl_logps: KL divergence of the reference model. Shape: (batch_size,)
+            beta: The temperature parameter in the DPO paper.
+            desirable_weight: The weight for the desirable responses.
+            undesirable_weight: The weight for the undesirable responses.
+
+        Refer to the KTO paper for details about hyperparameters https://arxiv.org/pdf/2402.01306
+        """
+        kl = (kl_logps - ref_kl_logps).mean().detach()
+        # all gather
+        dist.all_reduce(kl, op=dist.ReduceOp.SUM)
+        kl = (kl / dist.get_world_size()).clamp(min=0)
+
+        if chosen_logps.shape[0] != 0 and ref_chosen_logps.shape[0] != 0:
+            chosen_logratios = chosen_logps - ref_chosen_logps
+            chosen_losses = 1 - nn.functional.sigmoid(self.beta * (chosen_logratios - kl))
+            chosen_rewards = self.beta * chosen_logratios.detach()
+        else:
+            chosen_losses = torch.Tensor([]).to(kl_logps.device)
+            chosen_rewards = torch.Tensor([]).to(kl_logps.device)
+
+        if rejected_logps.shape[0] != 0 and ref_rejected_logps.shape[0] != 0:
+            rejected_logratios = rejected_logps - ref_rejected_logps
+            rejected_losses = 1 - nn.functional.sigmoid(self.beta * (kl - rejected_logratios))
+            rejected_rewards = self.beta * rejected_logratios.detach()
+        else:
+            rejected_losses = torch.Tensor([]).to(kl_logps.device)
+            rejected_rewards = torch.Tensor([]).to(kl_logps.device)
+
+        losses = torch.cat((self.desirable_weight * chosen_losses, self.undesirable_weight * rejected_losses), 0).mean()
+
+        return losses, chosen_rewards, rejected_rewards, kl
diff --git a/applications/ColossalChat/coati/models/utils.py b/applications/ColossalChat/coati/models/utils.py
index ce672534c28e..8ed8d34010b2 100755
--- a/applications/ColossalChat/coati/models/utils.py
+++ b/applications/ColossalChat/coati/models/utils.py
@@ -89,7 +89,9 @@ def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch
     return mean
 
 
-def calc_masked_log_probs(logits: torch.Tensor, sequences: torch.LongTensor, mask: torch.Tensor) -> torch.Tensor:
+def calc_masked_log_probs(
+    logits: torch.Tensor, sequences: torch.LongTensor, mask: torch.Tensor, length_normalization: bool = False
+) -> torch.Tensor:
     """
     Calculate the masked log probabilities for a given sequence of logits.
 
@@ -103,7 +105,11 @@ def calc_masked_log_probs(logits: torch.Tensor, sequences: torch.LongTensor, mas
     """
     # logits are probabilities of the next token, so we shift them to the left by one
     log_probs = _log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
-    return log_probs * mask
+
+    if not length_normalization:
+        return log_probs * mask
+    else:
+        return log_probs * mask / (mask.sum(dim=-1, keepdim=True) + 0.01)
 
 
 def load_json(file_path: Union[str, os.PathLike]) -> Dict[str, Any]:
diff --git a/applications/ColossalChat/coati/trainer/__init__.py b/applications/ColossalChat/coati/trainer/__init__.py
index 2eff8ca7676a..6d0900153e8a 100755
--- a/applications/ColossalChat/coati/trainer/__init__.py
+++ b/applications/ColossalChat/coati/trainer/__init__.py
@@ -1,7 +1,18 @@
 from .base import OLTrainer, SLTrainer
 from .dpo import DPOTrainer
+from .kto import KTOTrainer
+from .orpo import ORPOTrainer
 from .ppo import PPOTrainer
 from .rm import RewardModelTrainer
 from .sft import SFTTrainer
 
-__all__ = ["SLTrainer", "OLTrainer", "RewardModelTrainer", "SFTTrainer", "PPOTrainer", "DPOTrainer"]
+__all__ = [
+    "SLTrainer",
+    "OLTrainer",
+    "RewardModelTrainer",
+    "SFTTrainer",
+    "PPOTrainer",
+    "DPOTrainer",
+    "ORPOTrainer",
+    "KTOTrainer",
+]
diff --git a/applications/ColossalChat/coati/trainer/dpo.py b/applications/ColossalChat/coati/trainer/dpo.py
index cbe7d7ca811a..c7ef2be8f6c4 100755
--- a/applications/ColossalChat/coati/trainer/dpo.py
+++ b/applications/ColossalChat/coati/trainer/dpo.py
@@ -2,6 +2,7 @@
 Dpo trainer
 """
 
+import os
 from typing import Any, Optional
 
 import torch
@@ -25,7 +26,7 @@
 
 class DPOTrainer(SLTrainer):
     """
-        Trainer for PPO algorithm.
+        Trainer for DPO algorithm.
 
     Args:
         actor (Actor): the actor model in ppo algorithm
@@ -53,6 +54,8 @@ def __init__(
         tokenizer: PreTrainedTokenizerBase,
         max_epochs: int = 1,
         beta: float = 0.1,
+        gamma: float = 0.0,
+        length_normalization: bool = False,
         accumulation_steps: int = 1,
         start_epoch: int = 0,
         save_interval: int = 0,
@@ -63,7 +66,7 @@ def __init__(
         self.ref_model = ref_model
         self.actor_scheduler = actor_lr_scheduler
         self.tokenizer = tokenizer
-        self.actor_loss_fn = DpoLoss(beta)
+        self.actor_loss_fn = DpoLoss(beta, gamma)
         self.save_interval = save_interval
         self.coordinator = coordinator
         self.save_dir = save_dir
@@ -71,6 +74,7 @@ def __init__(
         self.accumulation_steps = accumulation_steps
         self.device = get_current_device()
         self.accumulative_meter = AccumulativeMeanMeter()
+        self.length_normalization = length_normalization
 
     def _before_fit(
         self,
@@ -131,18 +135,21 @@ def _train(self, epoch: int):
                 batch["reject_attention_mask"],
                 batch["reject_loss_mask"],
             )
-            reject_loss_mask[:, -1] = False
             batch_size = chosen_input_ids.size()[0]
 
             actor_all_logits = self.model(
                 input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
                 attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
-            )["logits"].to(torch.float32)
+            )["logits"]
             actor_chosen_logits = actor_all_logits[:batch_size]
             actor_reject_logits = actor_all_logits[batch_size:]
-            logprob_actor_chosen = calc_masked_log_probs(actor_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:])
+            logprob_actor_chosen = calc_masked_log_probs(
+                actor_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
+            )
 
-            logprob_actor_reject = calc_masked_log_probs(actor_reject_logits, reject_input_ids, reject_loss_mask[:, 1:])
+            logprob_actor_reject = calc_masked_log_probs(
+                actor_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
+            )
 
             if self.ref_model is not None:
                 self.ref_model.eval()
@@ -150,14 +157,14 @@ def _train(self, epoch: int):
                     ref_all_logits = self.ref_model(
                         input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
                         attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
-                    )["logits"].to(torch.float32)
+                    )["logits"]
                     ref_chosen_logits = ref_all_logits[:batch_size]
                     ref_reject_logits = ref_all_logits[batch_size:]
                     logprob_ref_chosen = calc_masked_log_probs(
-                        ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:]
+                        ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
                     )
                     logprob_ref_reject = calc_masked_log_probs(
-                        ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:]
+                        ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
                     )
             else:
                 logprob_ref_chosen = None
@@ -219,7 +226,7 @@ def _train(self, epoch: int):
                     )
                 self.accumulative_meter.reset()
 
-                if (self.num_train_step + 1) % self.save_interval == 0:
+                if self.save_dir is not None and (self.num_train_step + 1) % self.save_interval == 0:
                     # save checkpoint
                     self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
                     save_checkpoint(
@@ -283,16 +290,16 @@ def _eval(self, epoch: int):
                 actor_all_logits = self.model(
                     torch.cat([chosen_input_ids, reject_input_ids]),
                     torch.cat([chosen_attention_mask, reject_attention_mask]),
-                )["logits"].to(torch.float32)
+                )["logits"]
                 actor_chosen_logits = actor_all_logits[:batch_size]
                 actor_reject_logits = actor_all_logits[batch_size:]
 
                 logprob_actor_chosen = calc_masked_log_probs(
-                    actor_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:]
+                    actor_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
                 )
 
                 logprob_actor_reject = calc_masked_log_probs(
-                    actor_reject_logits, reject_input_ids, reject_loss_mask[:, 1:]
+                    actor_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
                 )
 
                 self.ref_model.eval()
@@ -300,11 +307,15 @@ def _eval(self, epoch: int):
                 ref_all_logits = self.ref_model(
                     torch.cat([chosen_input_ids, reject_input_ids]),
                     torch.cat([chosen_attention_mask, reject_attention_mask]),
-                )["logits"].to(torch.float32)
+                )["logits"]
                 ref_chosen_logits = ref_all_logits[:batch_size]
                 ref_reject_logits = ref_all_logits[batch_size:]
-                logprob_ref_chosen = calc_masked_log_probs(ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:])
-                logprob_ref_reject = calc_masked_log_probs(ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:])
+                logprob_ref_chosen = calc_masked_log_probs(
+                    ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
+                )
+                logprob_ref_reject = calc_masked_log_probs(
+                    ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
+                )
 
                 losses, chosen_rewards, rejected_rewards = self.actor_loss_fn(
                     logprob_actor_chosen,
@@ -314,7 +325,7 @@ def _eval(self, epoch: int):
                     chosen_loss_mask[:, 1:],
                     reject_loss_mask[:, 1:],
                 )
-                reward_accuracies = (chosen_rewards > rejected_rewards).float()
+                reward_accuracies = (chosen_rewards > rejected_rewards).float().mean()
                 loss = losses.mean()
                 loss_mean = all_reduce_mean(tensor=loss)
                 chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards)
@@ -333,4 +344,7 @@ def _eval(self, epoch: int):
         for tag in ["loss", "chosen_rewards", "rejected_rewards", "accuracy", "margin"]:
             msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n"
         self.coordinator.print_on_master(msg)
+        os.makedirs(self.save_dir, exist_ok=True)
+        with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
+            f.write(msg)
         step_bar.close()
diff --git a/applications/ColossalChat/coati/trainer/kto.py b/applications/ColossalChat/coati/trainer/kto.py
new file mode 100755
index 000000000000..8ab0bc66bcf9
--- /dev/null
+++ b/applications/ColossalChat/coati/trainer/kto.py
@@ -0,0 +1,318 @@
+"""
+KTO trainer
+"""
+
+import os
+from typing import Any, Optional
+
+import torch
+import torch.distributed
+from coati.models.loss import KTOLoss
+from coati.models.utils import calc_masked_log_probs
+from coati.trainer.utils import all_reduce_mean
+from coati.utils import AccumulativeMeanMeter, save_checkpoint
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.utils.data import DataLoader
+from tqdm import trange
+from transformers import PreTrainedTokenizerBase
+
+from colossalai.booster import Booster
+from colossalai.cluster import DistCoordinator
+from colossalai.utils import get_current_device
+
+from .base import SLTrainer
+from .utils import is_rank_0, to_device
+
+
+class KTOTrainer(SLTrainer):
+    """
+        Trainer for KTO algorithm.
+
+    Args:
+        actor (Actor): the actor model in ppo algorithm
+        ref_model (Critic): the reference model in ppo algorithm
+        booster (Strategy): the strategy to use for training
+        actor_optim (Optimizer): the optimizer to use for actor model
+        actor_lr_scheduler (_LRScheduler): the lr scheduler to use for actor model
+        tokenizer (PreTrainedTokenizerBase): the tokenizer to use for encoding
+        max_epochs (int, defaults to 1): the max number of epochs to train
+        accumulation_steps (int): the number of steps to accumulate gradients
+        start_epoch (int, defaults to 0): the start epoch, non-zero if resumed from a checkpoint
+        save_interval (int): the interval to save model checkpoints, default to 0, which means no checkpoint will be saved during trainning
+        save_dir (str): the directory to save checkpoints
+        coordinator (DistCoordinator): the coordinator to use for distributed logging
+        beta (float, defaults to 0.1): the beta parameter in kto loss
+        desirable_weight (float, defaults to 1.0): the weight for desirable reward
+        undesirable_weight (float, defaults to 1.0): the weight for undesirable reward
+    """
+
+    def __init__(
+        self,
+        actor: Any,
+        ref_model: Any,
+        booster: Booster,
+        actor_optim: Optimizer,
+        actor_lr_scheduler: _LRScheduler,
+        tokenizer: PreTrainedTokenizerBase,
+        max_epochs: int = 1,
+        beta: float = 0.1,
+        desirable_weight: float = 1.0,
+        undesirable_weight: float = 1.0,
+        accumulation_steps: int = 1,
+        start_epoch: int = 0,
+        save_interval: int = 0,
+        save_dir: str = None,
+        coordinator: DistCoordinator = None,
+    ) -> None:
+        super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, start_epoch=start_epoch)
+        self.ref_model = ref_model
+        self.actor_scheduler = actor_lr_scheduler
+        self.tokenizer = tokenizer
+        self.kto_loss = KTOLoss(beta=beta, desirable_weight=desirable_weight, undesirable_weight=undesirable_weight)
+        self.save_interval = save_interval
+        self.coordinator = coordinator
+        self.save_dir = save_dir
+        self.num_train_step = 0
+        self.accumulation_steps = accumulation_steps
+        self.device = get_current_device()
+        self.accumulative_meter = AccumulativeMeanMeter()
+        self.desirable_weight = desirable_weight
+        self.undesirable_weight = undesirable_weight
+        self.beta = beta
+
+    def _before_fit(
+        self,
+        train_preference_dataloader: DataLoader = None,
+        eval_preference_dataloader: DataLoader = None,
+        log_dir: Optional[str] = None,
+        use_wandb: bool = False,
+    ):
+        """
+        Args:
+            prompt_dataloader (DataLoader): the dataloader to use for prompt data
+            pretrain_dataloader (DataLoader): the dataloader to use for pretrain data
+        """
+        self.train_dataloader = train_preference_dataloader
+        self.eval_dataloader = eval_preference_dataloader
+        self.writer = None
+        if use_wandb and is_rank_0():
+            assert log_dir is not None, "log_dir must be provided when use_wandb is True"
+            import wandb
+
+            self.wandb_run = wandb.init(project="Coati-kto", sync_tensorboard=True)
+        if log_dir is not None and is_rank_0():
+            import os
+            import time
+
+            from torch.utils.tensorboard import SummaryWriter
+
+            log_dir = os.path.join(log_dir, "kto")
+            log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
+            self.writer = SummaryWriter(log_dir=log_dir)
+
+    def _train(self, epoch: int):
+        """
+        Args:
+            epoch int: the number of current epoch
+        """
+        self.model.train()
+        self.accumulative_meter.reset()
+        step_bar = trange(
+            len(self.train_dataloader) // self.accumulation_steps,
+            desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+            disable=not is_rank_0(),
+        )
+        for i, batch in enumerate(self.train_dataloader):
+            batch = to_device(batch, self.device)
+            (input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask) = (
+                batch["input_ids"],
+                batch["attention_mask"],
+                batch["loss_mask"],
+                batch["label"],
+                batch["kl_input_ids"],
+                batch["kl_attention_mask"],
+                batch["kl_loss_mask"],
+            )
+            batch_size = input_ids.size()[0]
+
+            # actor logits
+            with torch.no_grad():
+                # calculate KL term with KT data
+                kl_logits = self.model(
+                    input_ids=kl_input_ids,
+                    attention_mask=kl_attention_mask,
+                )["logits"]
+
+            logits = self.model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+            )["logits"]
+
+            logprob = calc_masked_log_probs(logits, input_ids, loss_mask[:, 1:]).sum(-1)
+            kl_logprob = calc_masked_log_probs(kl_logits, kl_input_ids, kl_loss_mask[:, 1:]).sum(-1)
+            chosen_index = [i for i in range(batch_size) if label[i] == 1]
+            rejected_index = [i for i in range(batch_size) if label[i] == 0]
+            chosen_logprob = logprob[chosen_index]
+            rejected_logprob = logprob[rejected_index]
+            with torch.no_grad():
+                ref_kl_logits = self.ref_model(
+                    input_ids=kl_input_ids,
+                    attention_mask=kl_attention_mask,
+                )["logits"]
+                ref_logits = self.ref_model(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                )["logits"]
+
+            ref_logprob = calc_masked_log_probs(ref_logits, input_ids, loss_mask[:, 1:]).sum(-1)
+            ref_kl_logprob = calc_masked_log_probs(ref_kl_logits, kl_input_ids, kl_loss_mask[:, 1:]).sum(-1)
+            ref_chosen_logprob = ref_logprob[chosen_index]
+            ref_rejected_logprob = ref_logprob[rejected_index]
+
+            loss, chosen_rewards, rejected_rewards, kl = self.kto_loss(
+                chosen_logprob, rejected_logprob, kl_logprob, ref_chosen_logprob, ref_rejected_logprob, ref_kl_logprob
+            )
+
+            self.booster.backward(loss=loss, optimizer=self.optimizer)
+            if self.num_train_step % self.accumulation_steps == self.accumulation_steps - 1:
+                self.optimizer.step()
+                self.optimizer.zero_grad()
+                self.actor_scheduler.step()
+
+            # sync
+            loss_mean = all_reduce_mean(tensor=loss)
+            chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards.mean())
+            rejected_rewards_mean = all_reduce_mean(tensor=rejected_rewards.mean())
+            self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
+            self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
+            self.accumulative_meter.add("loss", loss_mean.to(torch.float16).detach().item())
+
+            if i % self.accumulation_steps == self.accumulation_steps - 1:
+                self.num_train_step += 1
+                step_bar.update()
+                # logging
+                if self.writer and is_rank_0():
+                    self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
+                    self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
+                    self.writer.add_scalar(
+                        "train/chosen_rewards", self.accumulative_meter.get("chosen_rewards"), self.num_train_step
+                    )
+                    self.writer.add_scalar(
+                        "train/rejected_rewards",
+                        self.accumulative_meter.get("rejected_rewards"),
+                        self.num_train_step,
+                    )
+                    self.writer.add_scalar(
+                        "train/margin",
+                        self.accumulative_meter.get("chosen_rewards") - self.accumulative_meter.get("rejected_rewards"),
+                        self.num_train_step,
+                    )
+                self.accumulative_meter.reset()
+
+                if self.save_dir is not None and (self.num_train_step + 1) % self.save_interval == 0:
+                    # save checkpoint
+                    self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
+                    save_checkpoint(
+                        save_dir=self.save_dir,
+                        booster=self.booster,
+                        model=self.model,
+                        optimizer=self.optimizer,
+                        lr_scheduler=self.actor_scheduler,
+                        epoch=epoch,
+                        step=i + 1,
+                        batch_size=batch_size,
+                        coordinator=self.coordinator,
+                    )
+                    self.coordinator.print_on_master(
+                        f"Saved checkpoint at epoch {epoch} step {self.save_interval} at folder {self.save_dir}"
+                    )
+
+        step_bar.close()
+
+    def _eval(self, epoch: int):
+        """
+        Args:
+            epoch int: the number of current epoch
+        """
+        if self.eval_dataloader is None:
+            self.coordinator.print_on_master("No eval dataloader is provided, skip evaluation")
+            return
+        self.model.eval()
+        self.accumulative_meter.reset()
+        step_bar = trange(
+            len(self.train_dataloader) // self.accumulation_steps,
+            desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+            disable=not is_rank_0(),
+        )
+        for i, batch in enumerate(self.train_dataloader):
+            batch = to_device(batch, self.device)
+            (input_ids, attention_mask, loss_mask, label, kl_input_ids, kl_attention_mask, kl_loss_mask) = (
+                batch["input_ids"],
+                batch["attention_mask"],
+                batch["loss_mask"],
+                batch["label"],
+                batch["kl_input_ids"],
+                batch["kl_attention_mask"],
+                batch["kl_loss_mask"],
+            )
+            batch_size = input_ids.size()[0]
+
+            # actor logits
+            with torch.no_grad():
+                # calculate KL term with KT data
+                kl_logits = self.model(
+                    input_ids=kl_input_ids,
+                    attention_mask=kl_attention_mask,
+                )["logits"]
+
+                logits = self.model(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                )["logits"]
+
+            logprob = calc_masked_log_probs(logits, input_ids, loss_mask[:, 1:]).sum(-1)
+            kl_logprob = calc_masked_log_probs(kl_logits, kl_input_ids, kl_loss_mask[:, 1:]).sum(-1)
+            chosen_index = [i for i in range(batch_size) if label[i] == 1]
+            rejected_index = [i for i in range(batch_size) if label[i] == 0]
+            chosen_logprob = logprob[chosen_index]
+            rejected_logprob = logprob[rejected_index]
+            with torch.no_grad():
+                ref_kl_logits = self.ref_model(
+                    input_ids=kl_input_ids,
+                    attention_mask=kl_attention_mask,
+                )["logits"]
+
+                ref_logits = self.ref_model(
+                    input_ids=input_ids,
+                    attention_mask=attention_mask,
+                )["logits"]
+
+            ref_logprob = calc_masked_log_probs(ref_logits, input_ids, loss_mask[:, 1:]).sum(-1)
+            ref_kl_logprob = calc_masked_log_probs(ref_kl_logits, kl_input_ids, kl_loss_mask[:, 1:]).sum(-1)
+            ref_chosen_logprob = ref_logprob[chosen_index]
+            ref_rejected_logprob = ref_logprob[rejected_index]
+
+            loss, chosen_rewards, rejected_rewards, kl = self.kto_loss(
+                chosen_logprob, rejected_logprob, kl_logprob, ref_chosen_logprob, ref_rejected_logprob, ref_kl_logprob
+            )
+
+            # sync
+            loss_mean = all_reduce_mean(tensor=loss)
+            chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards.mean())
+            rejected_rewards_mean = all_reduce_mean(tensor=rejected_rewards.mean())
+            self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
+            self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
+            self.accumulative_meter.add("loss", loss_mean.to(torch.float16).detach().item())
+            self.accumulative_meter.add(
+                "margin", (chosen_rewards_mean - rejected_rewards_mean).to(torch.float16).mean().item()
+            )
+            step_bar.update()
+        msg = "Evaluation Result:\n"
+        for tag in ["loss", "chosen_rewards", "rejected_rewards", "margin"]:
+            msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n"
+        self.coordinator.print_on_master(msg)
+        os.makedirs(self.save_dir, exist_ok=True)
+        with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
+            f.write(msg)
+        step_bar.close()
diff --git a/applications/ColossalChat/coati/trainer/orpo.py b/applications/ColossalChat/coati/trainer/orpo.py
new file mode 100644
index 000000000000..b039da4afc30
--- /dev/null
+++ b/applications/ColossalChat/coati/trainer/orpo.py
@@ -0,0 +1,314 @@
+"""
+Orpo trainer
+"""
+
+import os
+from typing import Any, Optional
+
+import torch
+from coati.models.loss import OddsRatioLoss
+from coati.models.utils import calc_masked_log_probs
+from coati.trainer.utils import all_reduce_mean
+from coati.utils import AccumulativeMeanMeter, save_checkpoint
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.utils.data import DataLoader
+from tqdm import trange
+from transformers import PreTrainedTokenizerBase
+
+from colossalai.booster import Booster
+from colossalai.cluster import DistCoordinator
+from colossalai.utils import get_current_device
+
+from .base import SLTrainer
+from .utils import is_rank_0, to_device
+
+
+class ORPOTrainer(SLTrainer):
+    """
+        Trainer for ORPO algorithm.
+
+    Args:
+        actor (Actor): the actor model in ppo algorithm
+        booster (Strategy): the strategy to use for training
+        actor_optim (Optimizer): the optimizer to use for actor model
+        actor_lr_scheduler (_LRScheduler): the lr scheduler to use for actor model
+        tokenizer (PreTrainedTokenizerBase): the tokenizer to use for encoding
+        max_epochs (int, defaults to 1): the max number of epochs to train
+        lam (float, defaults to 0.1): the lambda parameter in ORPO loss
+        accumulation_steps (int): the number of steps to accumulate gradients
+        start_epoch (int, defaults to 0): the start epoch, non-zero if resumed from a checkpoint
+        save_interval (int): the interval to save model checkpoints, default to 0, which means no checkpoint will be saved during trainning
+        save_dir (str): the directory to save checkpoints
+        coordinator (DistCoordinator): the coordinator to use for distributed logging
+    """
+
+    def __init__(
+        self,
+        actor: Any,
+        booster: Booster,
+        actor_optim: Optimizer,
+        actor_lr_scheduler: _LRScheduler,
+        tokenizer: PreTrainedTokenizerBase,
+        max_epochs: int = 1,
+        lam: float = 0.1,
+        accumulation_steps: int = 1,
+        start_epoch: int = 0,
+        save_interval: int = 0,
+        save_dir: str = None,
+        coordinator: DistCoordinator = None,
+    ) -> None:
+        super().__init__(booster, max_epochs=max_epochs, model=actor, optimizer=actor_optim, start_epoch=start_epoch)
+        self.actor_scheduler = actor_lr_scheduler
+        self.tokenizer = tokenizer
+        self.odds_ratio_loss_fn = OddsRatioLoss()
+        self.save_interval = save_interval
+        self.coordinator = coordinator
+        self.save_dir = save_dir
+        self.num_train_step = 0
+        self.lam = lam
+        self.accumulation_steps = accumulation_steps
+        self.device = get_current_device()
+        self.accumulative_meter = AccumulativeMeanMeter()
+
+    def _before_fit(
+        self,
+        train_preference_dataloader: DataLoader = None,
+        eval_preference_dataloader: DataLoader = None,
+        log_dir: Optional[str] = None,
+        use_wandb: bool = False,
+    ):
+        """
+        Args:
+            prompt_dataloader (DataLoader): the dataloader to use for prompt data
+            pretrain_dataloader (DataLoader): the dataloader to use for pretrain data
+        """
+        self.train_dataloader = train_preference_dataloader
+        self.eval_dataloader = eval_preference_dataloader
+        self.writer = None
+        if use_wandb and is_rank_0():
+            assert log_dir is not None, "log_dir must be provided when use_wandb is True"
+            import wandb
+
+            self.wandb_run = wandb.init(project="Coati-orpo", sync_tensorboard=True)
+        if log_dir is not None and is_rank_0():
+            import os
+            import time
+
+            from torch.utils.tensorboard import SummaryWriter
+
+            log_dir = os.path.join(log_dir, "orpo")
+            log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
+            self.writer = SummaryWriter(log_dir=log_dir)
+
+    def _train(self, epoch: int):
+        """
+        Args:
+            epoch int: the number of current epoch
+        """
+        self.model.train()
+        self.accumulative_meter.reset()
+        step_bar = trange(
+            len(self.train_dataloader) // self.accumulation_steps,
+            desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+            disable=not is_rank_0(),
+        )
+        for i, batch in enumerate(self.train_dataloader):
+            batch = to_device(batch, self.device)
+            (
+                chosen_input_ids,
+                chosen_attention_mask,
+                chosen_loss_mask,
+                reject_input_ids,
+                reject_attention_mask,
+                reject_loss_mask,
+            ) = (
+                batch["chosen_input_ids"],
+                batch["chosen_attention_mask"],
+                batch["chosen_loss_mask"],
+                batch["reject_input_ids"],
+                batch["reject_attention_mask"],
+                batch["reject_loss_mask"],
+            )
+            batch_size = chosen_input_ids.size()[0]
+            actor_out = self.model(
+                input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
+                attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
+                labels=torch.cat(
+                    [chosen_input_ids, torch.ones_like(reject_input_ids, dtype=reject_input_ids.dtype) * -100]
+                ),
+            )
+            torch.autograd.set_detect_anomaly(True)
+            actor_all_logits = actor_out["logits"].to(torch.float32)
+            actor_chosen_logits = actor_all_logits[:batch_size]
+            actor_reject_logits = actor_all_logits[batch_size:]
+            logprob_actor_chosen = calc_masked_log_probs(actor_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:])
+
+            logprob_actor_reject = calc_masked_log_probs(actor_reject_logits, reject_input_ids, reject_loss_mask[:, 1:])
+            # label_chosen[chosen_loss_mask[:, 1:] == 0] = -100
+            chosen_nll = actor_out["loss"]
+            odds_ratio_loss, log_odds_ratio = self.odds_ratio_loss_fn(
+                logprob_actor_chosen, logprob_actor_reject, chosen_loss_mask[:, 1:], reject_loss_mask[:, 1:]
+            )
+            loss = chosen_nll - odds_ratio_loss * self.lam
+            step_bar.set_description(f"Epoch {epoch + 1}/{self.max_epochs} Loss: {loss.detach().cpu().item():.4f}")
+
+            self.booster.backward(loss=loss, optimizer=self.optimizer)
+            if self.num_train_step % self.accumulation_steps == self.accumulation_steps - 1:
+                self.optimizer.step()
+                self.optimizer.zero_grad()
+                self.actor_scheduler.step()
+
+            chosen_rewards = torch.sum(logprob_actor_chosen) / torch.sum(chosen_loss_mask[:, 1:])
+            rejected_rewards = torch.sum(logprob_actor_reject) / torch.sum(reject_loss_mask[:, 1:])
+            reward_accuracies = torch.sum((log_odds_ratio > 0).float()) / torch.sum(log_odds_ratio != 0)
+
+            # sync
+            loss_mean = all_reduce_mean(tensor=loss)
+            chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards)
+            rejected_rewards_mean = all_reduce_mean(tensor=rejected_rewards)
+            reward_accuracies_mean = all_reduce_mean(tensor=reward_accuracies)
+            self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
+            self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
+            self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
+            self.accumulative_meter.add("log_odds_ratio", log_odds_ratio.to(torch.float16).mean().item())
+            self.accumulative_meter.add("accuracy", reward_accuracies_mean.to(torch.float16).item())
+
+            if i % self.accumulation_steps == self.accumulation_steps - 1:
+                self.num_train_step += 1
+                step_bar.update()
+                # logging
+                if self.writer and is_rank_0():
+                    self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
+                    self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
+                    self.writer.add_scalar(
+                        "train/chosen_rewards", self.accumulative_meter.get("chosen_rewards"), self.num_train_step
+                    )
+                    self.writer.add_scalar(
+                        "train/rejected_rewards",
+                        self.accumulative_meter.get("rejected_rewards"),
+                        self.num_train_step,
+                    )
+                    self.writer.add_scalar(
+                        "train/margin",
+                        self.accumulative_meter.get("chosen_rewards") - self.accumulative_meter.get("rejected_rewards"),
+                        self.num_train_step,
+                    )
+                    self.writer.add_scalar(
+                        "train/accuracy",
+                        self.accumulative_meter.get("accuracy"),
+                        self.num_train_step,
+                    )
+                    self.writer.add_scalar(
+                        "train/log_odds_ratio",
+                        self.accumulative_meter.get("log_odds_ratio"),
+                        self.num_train_step,
+                    )
+                self.accumulative_meter.reset()
+
+                if self.save_dir is not None and (self.num_train_step + 1) % self.save_interval == 0:
+                    # save checkpoint
+                    self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
+                    save_checkpoint(
+                        save_dir=self.save_dir,
+                        booster=self.booster,
+                        model=self.model,
+                        optimizer=self.optimizer,
+                        lr_scheduler=self.actor_scheduler,
+                        epoch=epoch,
+                        step=i + 1,
+                        batch_size=batch_size,
+                        coordinator=self.coordinator,
+                    )
+                    self.coordinator.print_on_master(
+                        f"Saved checkpoint at epoch {epoch} step {self.save_interval} at folder {self.save_dir}"
+                    )
+
+        step_bar.close()
+
+    def _eval(self, epoch: int):
+        """
+        Args:
+            epoch int: the number of current epoch
+        """
+        if self.eval_dataloader is None:
+            self.coordinator.print_on_master("No eval dataloader is provided, skip evaluation")
+            return
+        self.model.eval()
+        self.coordinator.print_on_master("\nStart evaluation...")
+
+        step_bar = trange(
+            len(self.eval_dataloader),
+            desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+            disable=not is_rank_0(),
+        )
+
+        self.accumulative_meter.reset()
+
+        with torch.no_grad():
+            for i, batch in enumerate(self.eval_dataloader):
+                batch = to_device(batch, self.device)
+                (
+                    chosen_input_ids,
+                    chosen_attention_mask,
+                    chosen_loss_mask,
+                    reject_input_ids,
+                    reject_attention_mask,
+                    reject_loss_mask,
+                ) = (
+                    batch["chosen_input_ids"],
+                    batch["chosen_attention_mask"],
+                    batch["chosen_loss_mask"],
+                    batch["reject_input_ids"],
+                    batch["reject_attention_mask"],
+                    batch["reject_loss_mask"],
+                )
+                batch_size = chosen_input_ids.size()[0]
+                actor_out = self.model(
+                    input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
+                    attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
+                    labels=torch.cat(
+                        [chosen_input_ids, torch.ones_like(reject_input_ids, dtype=reject_input_ids.dtype) * -100]
+                    ),
+                )
+                torch.autograd.set_detect_anomaly(True)
+                actor_all_logits = actor_out["logits"].to(torch.float32)
+                actor_chosen_logits = actor_all_logits[:batch_size]
+                actor_reject_logits = actor_all_logits[batch_size:]
+                logprob_actor_chosen = calc_masked_log_probs(
+                    actor_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:]
+                )
+
+                logprob_actor_reject = calc_masked_log_probs(
+                    actor_reject_logits, reject_input_ids, reject_loss_mask[:, 1:]
+                )
+                chosen_nll = actor_out["loss"]
+                odds_ratio_loss, log_odds_ratio = self.odds_ratio_loss_fn(
+                    logprob_actor_chosen, logprob_actor_reject, chosen_loss_mask[:, 1:], reject_loss_mask[:, 1:]
+                )
+                loss = chosen_nll - odds_ratio_loss * self.lam
+                step_bar.set_description(f"Epoch {epoch + 1}/{self.max_epochs} Loss: {loss.detach().cpu().item():.4f}")
+
+                chosen_rewards = torch.sum(logprob_actor_chosen) / torch.sum(chosen_loss_mask[:, 1:])
+                rejected_rewards = torch.sum(logprob_actor_reject) / torch.sum(reject_loss_mask[:, 1:])
+                reward_accuracies = torch.sum((log_odds_ratio > 0).float()) / torch.sum(log_odds_ratio != 0)
+
+                # sync
+                loss_mean = all_reduce_mean(tensor=loss)
+                chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards)
+                rejected_rewards_mean = all_reduce_mean(tensor=rejected_rewards)
+                reward_accuracies_mean = all_reduce_mean(tensor=reward_accuracies)
+                self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
+                self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
+                self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
+                self.accumulative_meter.add("log_odds_ratio", log_odds_ratio.to(torch.float16).mean().item())
+                self.accumulative_meter.add("accuracy", reward_accuracies_mean.to(torch.float16).item())
+
+        msg = "Evaluation Result:\n"
+        for tag in ["loss", "chosen_rewards", "rejected_rewards", "log_odds_ratio", "accuracy"]:
+            msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n"
+        self.coordinator.print_on_master(msg)
+        os.makedirs(self.save_dir, exist_ok=True)
+        with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
+            f.write(msg)
+        step_bar.close()
diff --git a/applications/ColossalChat/coati/trainer/rm.py b/applications/ColossalChat/coati/trainer/rm.py
index 0fb714a62bce..b9e84ef557fa 100755
--- a/applications/ColossalChat/coati/trainer/rm.py
+++ b/applications/ColossalChat/coati/trainer/rm.py
@@ -237,6 +237,7 @@ def _eval(self, epoch):
                 + f"distance: {self.accumulative_meter.get('chosen_rewards')-self.accumulative_meter.get('rejected_rewards')}\n"
             )
             self.coordinator.print_on_master(msg)
+            os.makedirs(self.save_dir, exist_ok=True)
             with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
                 f.write(msg)
             step_bar.close()
diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py
index c95f5b65a822..c09d61034984 100755
--- a/applications/ColossalChat/coati/trainer/sft.py
+++ b/applications/ColossalChat/coati/trainer/sft.py
@@ -102,6 +102,7 @@ def _train(self, epoch: int):
             batch_size = batch["input_ids"].size(0)
             outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
             loss = outputs.loss
+
             self.booster.backward(loss=loss, optimizer=self.optimizer)
 
             loss_mean = all_reduce_mean(tensor=loss)
@@ -113,6 +114,7 @@ def _train(self, epoch: int):
                 self.optimizer.zero_grad()
                 self.scheduler.step()
 
+                step_bar.set_postfix({"train/loss": self.accumulative_meter.get("loss")})
                 if self.writer:
                     self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
                     self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], self.num_train_step)
@@ -165,6 +167,7 @@ def _eval(self, epoch: int):
             for tag in ["loss"]:
                 msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n"
             self.coordinator.print_on_master(msg)
+            os.makedirs(self.save_dir, exist_ok=True)
             with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
                 f.write(msg)
             step_bar.close()
diff --git a/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-32B-Chat.json b/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-32B-Chat.json
new file mode 100644
index 000000000000..58941a5918ff
--- /dev/null
+++ b/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-32B-Chat.json
@@ -0,0 +1,9 @@
+{
+    "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+    "stop_ids": [
+        151645,
+        151643
+    ],
+    "end_of_assistant": "<|im_end|>"
+}
diff --git a/applications/ColossalChat/config/conversation_template/tiny-llama.json b/applications/ColossalChat/config/conversation_template/tiny-llama.json
new file mode 100644
index 000000000000..59196159f930
--- /dev/null
+++ b/applications/ColossalChat/config/conversation_template/tiny-llama.json
@@ -0,0 +1,8 @@
+{
+    "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n'  + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
+    "system_message": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+    "stop_ids": [
+        2
+    ],
+    "end_of_assistant": "</s>"
+}
diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md
index a29fc7508e60..b749f197ed21 100755
--- a/applications/ColossalChat/examples/README.md
+++ b/applications/ColossalChat/examples/README.md
@@ -9,6 +9,7 @@
   - [Install Requirements](#install-requirements)
   - [Get Start with ColossalRun](#get-start-with-colossalrun)
   - [Training Configuration](#training-configuration)
+  - [Parameter Efficient Finetuning (PEFT)](#parameter-efficient-finetuning-peft)
   - [RLHF Stage 1: Supervised Instruction Tuning](#rlhf-training-stage1---supervised-instructs-tuning)
     - [Step 1: Data Collection](#step-1-data-collection)
     - [Step 2: Preprocessing](#step-2-preprocessing)
@@ -29,6 +30,9 @@
   - [Alternative Option For RLHF: Direct Preference Optimization](#alternative-option-for-rlhf-direct-preference-optimization)
     - [DPO Stage 1: Supervised Instruction Tuning](#dpo-training-stage1---supervised-instructs-tuning)
     - [DPO Stage 2: DPO Training](#dpo-training-stage2---dpo-training)
+  - [Alternative Option For RLHF: Simple Preference Optimization](#alternative-option-for-rlhf-simple-preference-optimization)
+  - [Alternative Option For RLHF: Kahneman-Tversky Optimization (KTO)](#alternative-option-for-rlhf-kahneman-tversky-optimization-kto)
+  - [Alternative Option For RLHF: Odds Ratio Preference Optimization](#alternative-option-for-rlhf-odds-ratio-preference-optimization)
   - [List of Supported Models](#list-of-supported-models)
   - [Hardware Requirements](#hardware-requirements)
   - [Inference example](#inference-example)
@@ -45,9 +49,6 @@
 pip install -r requirements.txt
 ```
 
-
-
-
 ## Get Start with ColossalRun
 
 
@@ -81,8 +82,6 @@ Make sure the master node can access all nodes (including itself) by ssh without
 This section gives a simple introduction on different training strategies that you can use and how to use them with our boosters and plugins to reduce training time and VRAM consumption. For more details regarding training strategies, please refer to [here](https://colossalai.org/docs/concepts/paradigms_of_parallelism). For details regarding boosters and plugins, please refer to [here](https://colossalai.org/docs/basics/booster_plugins).
 
 
-
-
 <details><summary><b>Gemini (Zero3)</b></summary>
 
 
@@ -374,35 +373,6 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
 </details>
 
 
-<details><summary><b>Low Rank Adaption</b></summary>
-
-
-Details about Low Rank Adaption (LoRA) can be found in the paper: [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685). It dramatically reduces the VRAM consumption at the cost of sacrifice model capability. It is suitable for training LLM with constrained resources.
-
-
-To enable LoRA, set --lora_rank to a positive value (usually between 20 and 64).
-```
-colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
-    --pretrain $PRETRAINED_MODEL_PATH \
-    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
-    --dataset ${dataset[@]} \
-    --save_interval 5000 \
-    --save_path $SAVE_DIR \
-    --config_file $CONFIG_FILE \
-    --plugin zero2_cpu \
-    --batch_size 4 \
-    --max_epochs 1 \
-    --accumulation_steps 4 \
-    --lr 2e-5 \
-    --max_len 2048 \
-    --lora_rank 32 \ # This enables LoRA
-    --use_wandb
-```
-
-
-</details>
-
-
 <details><summary><b>Other Training Arguments</b></summary>
 
 
@@ -427,6 +397,60 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
 - use_wandb: if this flag is up, you can view logs on wandb.
 
 
+</details>
+
+### Parameter Efficient Finetuning (PEFT)
+
+Currently, we have support LoRA (low-rank adaptation) and PiSSA (principal singular values and singular vectors adaptation). Both help to reduce the running-time VRAM consumption as well as timing at the cost of overall model performance.
+
+
+<details><summary><b>Low Rank Adaption and PiSSA</b></summary>
+
+
+Details about Low Rank Adaption (LoRA) can be found in the paper: [LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685). Details about Principal Singular Values and Singular Vectors Adaptation (PiSSA) can be found in the paper: [PiSSA: Principal Singular Values and Singular Vectors Adaptation of Large Language Models](https://arxiv.org/abs/2404.02948). Both help to reduce the running-time VRAM consumption as well as timing at the cost of overall model performance. It is suitable for training LLM with constrained resources.
+
+To use LoRA/PiSSA in training, please create a config file as in the following example and set the `--lora_config` to that configuration file.
+
+```json
+{
+    "r": 128,
+    "embedding_lora_dropout": 0.0,
+    "linear_lora_dropout": 0.1,
+    "lora_alpha": 32,
+    "lora_train_bias": "all",
+    "lora_initialization_method": "PiSSA",
+    "target_modules": ["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj", "embed_tokens"]
+}
+```
+#### Lora Parameters
+- r: lora rank
+- embedding_lora_dropout: dropout probability for embedding layer
+- linear_lora_dropout: dropout probability for linear layer
+- lora_alpha: lora alpha, controls how much the adaptor can deviate from the pretrained model.
+- lora_train_bias: whether to add trainable bias to lora layers, choose from "all" (all layers (including but not limited to lora layers) will have trainable biases), "none" (no trainable biases), "lora" (only lora layers will have trainable biases)
+- lora_initialization_method: how to initialize lora weights, choose one from ["kaiming_uniform", "PiSSA"], default to "kaiming_uniform". Use "kaiming_uniform" for standard LoRA and "PiSSA" for PiSSA.
+- target_modules: which module(s) should be converted to lora layers, if the module's name contain the keywords in target modules and the module is a linear or embedding layer, the module will be converted. Otherwise, the module will be frozen. Setting this field to None will automatically convert all linear and embedding layer to their LoRA counterparts. Note that this example only works for LLaMA, for other models, you need to modify it.
+
+
+```
+colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile train_sft.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --save_interval 5000 \
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --plugin zero2_cpu \
+    --batch_size 4 \
+    --max_epochs 1 \
+    --accumulation_steps 4 \
+    --lr 2e-5 \
+    --max_len 2048 \
+    --lora_config /PATH/TO/THE/LORA/CONFIG/FILE.json \ # Setting this enables LoRA
+    --use_wandb
+```
+
+
 </details>
 
 
@@ -445,7 +469,7 @@ The first step in Stage 1 is to collect a dataset of human demonstrations of the
     {"messages":
       [
         {
-          "from": "human",
+          "from": "user",
           "content": "what are some pranks with a pen i can do?"
         },
         {
@@ -470,9 +494,15 @@ In this code we provide a flexible way for users to set the conversation templat
 - Step 1: (Optional). Define your conversation template. You need to provide a conversation template config file similar to the config files under the ./config/conversation_template directory. This config should include the following fields.
   ```json
   {
-      "chat_template": (Optional), A string of chat_template used for formatting chat data. If not set (None), will use the default chat template of the provided tokenizer. If a path to a huggingface model or local model is provided, will use the chat_template of that model. To use a custom chat template, you need to manually set this field. For more details on how to write a chat template in Jinja format, please read https://huggingface.co/docs/transformers/main/chat_templating,
-      "system_message": A string of system message to be added at the beginning of the prompt. If no is provided (None), no system message will be added,
-      "end_of_assistant": The token(s) in string that denotes the end of assistance's response. For example, in the ChatGLM2 prompt format,
+      "chat_template": "A string of chat_template used for formatting chat data",
+      "system_message": "A string of system message to be added at the beginning of the prompt. If no is provided (None), no system message will be added",
+      "end_of_assistant": "The token(s) in string that denotes the end of assistance's response",
+      "stop_ids": "A list of integers corresponds to the `end_of_assistant` tokens that indicate the end of assistance's response during the rollout stage of PPO training"
+  }
+  ```
+  * `chat_template`: (Optional), A string of chat_template used for formatting chat data. If not set (None), will use the default chat template of the provided tokenizer. If a path to a huggingface model or local model is provided, will use the chat_template of that model. To use a custom chat template, you need to manually set this field. For more details on how to write a chat template in Jinja format, please read https://huggingface.co/docs/transformers/main/chat_templating.
+  * `system_message`: A string of system message to be added at the beginning of the prompt. If no is provided (None), no system message will be added.
+  * `end_of_assistant`: The token(s) in string that denotes the end of assistance's response". For example, in the ChatGLM2 prompt format,
       ```
       <|im_start|>system
       system messages
@@ -481,15 +511,13 @@ In this code we provide a flexible way for users to set the conversation templat
       <|im_start|>user
        How far is the moon? <|im_end|>
       <|im_start|>assistant\n The moon is about 384,400 kilometers away from Earth.<|im_end|>...
-       ```
-       the end_of_assistant tokens are "<|im_end|>"
-      "stop_ids": (Optional), A list of integers corresponds to the `end_of_assistant` tokens that indicate the end of assistance's response during the rollout stage of PPO training. It's recommended to set this manually for PPO training. If not set, will set to tokenizer.eos_token_ids automatically
-  }
-  ```
-  On your first run of the data preparation script, you only need to define the "chat_template" (if you want to use custom chat template) and the "system message" (if you want to use a custom system message),
+      ```
+      the `end_of_assistant` tokens are "<|im_end|>"
+  * `stop_ids`: (Optional), A list of integers corresponds to the `end_of_assistant` tokens that indicate the end of assistance's response during the rollout stage of PPO training. It's recommended to set this manually for PPO training. If not set, will set to tokenizer.eos_token_ids automatically.
 
+  On your first run of the data preparation script, you only need to define the `chat_template` (if you want to use custom chat template) and the `system message` (if you want to use a custom system message)
 
-- Step 2: Run the data preparation script--- [prepare_sft_dataset.sh](./examples/data_preparation_scripts/prepare_sft_dataset.sh). Note that whether or not you have skipped the first step, you need to provide the path to the conversation template config file (via the conversation_template_config arg). If you skipped the first step, an auto-generated conversation template will be stored at the designated file path.
+- Step 2: Run the data preparation script--- [prepare_sft_dataset.sh](./data_preparation_scripts/prepare_sft_dataset.sh). Note that whether or not you have skipped the first step, you need to provide the path to the conversation template config file (via the conversation_template_config arg). If you skipped the first step, an auto-generated conversation template will be stored at the designated file path.
 
 
 - Step 3: (Optional) Check the correctness of the processed data. We provided an easy way for you to do a manual checking on the processed data by checking the "$SAVE_DIR/jsonl/part-XXXX.jsonl" files.
@@ -509,7 +537,7 @@ Human: <s> what are some pranks with a pen i can do?</s> Assistant: <s> Are you
 
 
 #### Step 3: Training
-Choose a suitable model architecture for your task. Note that your model should be compatible with the tokenizer that you used to tokenize the SFT dataset. You can run [train_sft.sh](./examples/training_scripts/train_sft.sh) to start a supervised instructs fine-tuning. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
+Choose a suitable model architecture for your task. Note that your model should be compatible with the tokenizer that you used to tokenize the SFT dataset. You can run [train_sft.sh](./training_scripts/train_sft.sh) to start a supervised instructs fine-tuning. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
 
 ### RLHF Training Stage2 - Training Reward Model
@@ -526,7 +554,7 @@ Below shows the preference dataset format used in training the reward model.
 [
     {"context": [
         {
-          "from": "human",
+          "from": "user",
           "content": "Introduce butterflies species in Oregon."
         }
       ]
@@ -551,11 +579,11 @@ Below shows the preference dataset format used in training the reward model.
 
 
 #### Step 2: Preprocessing
-Similar to the second step in the previous stage, we format the reward data into the same structured format as used in step 2 of the SFT stage. You can run [prepare_preference_dataset.sh](./examples/data_preparation_scripts/prepare_preference_dataset.sh) to prepare the preference data for reward model training.
+Similar to the second step in the previous stage, we format the reward data into the same structured format as used in step 2 of the SFT stage. You can run [prepare_preference_dataset.sh](./data_preparation_scripts/prepare_preference_dataset.sh) to prepare the preference data for reward model training.
 
 
 #### Step 3: Training
-You can run [train_rm.sh](./examples/training_scripts/train_rm.sh) to start the reward model training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
+You can run [train_rm.sh](./training_scripts/train_rm.sh) to start the reward model training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
 
 #### Features and Tricks in RM Training
@@ -595,7 +623,7 @@ In stage3 we will use reinforcement learning algorithm--- Proximal Policy Optimi
 
 
 #### Step 1: Data Collection
-PPO uses two kinds of training data--- the prompt data and the pretrain data (optional). The first dataset is mandatory, data samples within the prompt dataset ends with a line from "human" and thus the "assistant" needs to generate a response to answer to the "human". Note that you can still use conversation that ends with a line from the "assistant", in that case, the last line will be dropped. Here is an example of the prompt dataset format.
+PPO uses two kinds of training data--- the prompt data and the pretrain data (optional). The first dataset is mandatory, data samples within the prompt dataset ends with a line from "user" and thus the "assistant" needs to generate a response to answer to the "user". Note that you can still use conversation that ends with a line from the "assistant", in that case, the last line will be dropped. Here is an example of the prompt dataset format.
 
 
 ```json
@@ -603,7 +631,7 @@ PPO uses two kinds of training data--- the prompt data and the pretrain data (op
     {"messages":
       [
         {
-          "from": "human",
+          "from": "user",
           "content": "what are some pranks with a pen i can do?"
         }
         ...
@@ -626,14 +654,14 @@ The second dataset--- pretrained dataset is optional, provide it if you want to
   ]
   ```
 #### Step 2: Preprocessing
-To prepare the prompt dataset for PPO training, simply run [prepare_prompt_dataset.sh](./examples/data_preparation_scripts/prepare_prompt_dataset.sh)
+To prepare the prompt dataset for PPO training, simply run [prepare_prompt_dataset.sh](./data_preparation_scripts/prepare_prompt_dataset.sh)
 
 
 You can use the SFT dataset you prepared in the SFT stage or prepare a new one from different source for the ptx dataset. The ptx data is used to calculate ptx loss, which stabilizes the training according to the [InstructGPT paper](https://arxiv.org/pdf/2203.02155.pdf).
 
 
 #### Step 3: Training
-You can run the [train_ppo.sh](./examples/training_scripts/train_ppo.sh) to start PPO training. Here are some unique arguments for PPO, please refer to the training configuration section for other training configuration. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
+You can run the [train_ppo.sh](./training_scripts/train_ppo.sh) to start PPO training. Here are some unique arguments for PPO, please refer to the training configuration section for other training configuration. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
 
 
 ```bash
@@ -717,17 +745,90 @@ For DPO training, you only need the preference dataset. Please follow the instru
 
 
 #### Step 2: Training
-You can run the [train_dpo.sh](./examples/training_scripts/train_dpo.sh) to start DPO training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options.
+You can run the [train_dpo.sh](./training_scripts/train_dpo.sh) to start DPO training. Please refer to the [training configuration](#training-configuration) section for details regarding supported training options. Following the trend of recent research on DPO-like alignment methods, we added option for the user to choose from, including whether to do length normalization , reward shaping and whether to use a reference model in calculating implicit reward. Here are those options,
 
+```
+--beta 0.1 \     # the temperature in DPO loss, Default to 0.1
+--gamma 0.0 \     # the reward target margin in the SimPO paper, Default to 0.
+--disable_reference_model \   # whether to disable the reference model, if set, the implicit reward will be calculated solely from the actor. Default to enable reference model in DPO
+--length_normalization \  # whether to apply length normalization, Default to not use
+```
 
 #### DPO Result
 <p align="center">
 <img width="1000" alt="image" src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/DPO.png">
 </p>
 
+### Alternative Option For RLHF: Simple Preference Optimization
+
+We support the method introduced in the paper [SimPO: Simple Preference Optimization
+with a Reference-Free Reward](https://arxiv.org/pdf/2405.14734) (SimPO). Which is a reference model free aligment method that add length normalization and reward shaping to the DPO loss to enhance training stability and efficiency. As the method doesn't deviate too much from DPO, we add support for length normalization and SimPO reward shaping in our DPO implementation. To use SimPO in alignment, use the [train_dpo.sh](./training_scripts/train_dpo.sh) script, set the `loss_type` to `simpo_loss`, you can also set the value for temperature (`beta`) and reward target margin (`gamma`) but it is optional.
+
+#### SimPO Result
+<p align="center">
+<img width="1000" alt="image" src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/SimPO_margin.png">
+</p>
+
+
+### Alternative Option For RLHF: Odds Ratio Preference Optimization
+We support the method introduced in the paper [ORPO: Monolithic Preference Optimization without Reference Model](https://arxiv.org/abs/2403.07691) (ORPO). Which is a reference model free aligment method that mixes the SFT loss with a reinforcement learning loss that uses odds ratio as the implicit reward to enhance training stability and efficiency. To use ORPO in alignment, use the [train_orpo.sh](./training_scripts/train_orpo.sh) script, You can set the value for `lambda` (which determine how strongly the reinforcement learning loss affect the training) but it is optional.
+
+#### ORPO Result
+<p align="center">
+<img width="1000" alt="image" src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ORPO_margin.png">
+</p>
+
+### Alternative Option For RLHF: Kahneman-Tversky Optimization (KTO)
+We support the method introduced in the paper [KTO:Model Alignment as Prospect Theoretic Optimization](https://arxiv.org/pdf/2402.01306) (KTO). Which is a aligment method that directly maximize "human utility" of generation results.
+
+For KTO data preparation, please use the script [prepare_kto_dataset.sh](./examples/data_preparation_scripts/prepare_kto_dataset.sh). You will need preference data, different from DPO and its derivatives, you no longer need a pair of chosen/rejected response for the same input. You only need data whose response is associated with a preference label--- whether the response is okay or not, read the papre for more details. You also need to convert your data to the following intermediate format before you run the data preparation script.
+
+```jsonl
+{
+  "prompt": [
+    {
+      "from": "user",
+      "content": "What are some praise words in english?"
+    },
+    {
+      "from": "assistant",
+      "content": "Here's an incomplete list.\n\nexcellent, fantastic, impressive  ..."
+    },
+    {
+      "from": "user",
+      "content": "What's your favorite one?"
+    }
+  ],
+  "completion": {
+    "from": "assistant",
+    "content": "impressive."
+  },
+  "label": true
+}
+
+```
+
+For training, use the [train_kto.sh](./examples/training_scripts/train_orpo.sh) script, You may need to set the value for `beta` (which determine how strongly the reinforcement learning loss affect the training), `desirable_weight` and `undesirable_weight` if your data is biased (has unequal number of chosen and rejected samples).
+
+#### KTO Result
+<p align="center">
+<img width="1000" alt="image" src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/KTO.png">
+</p>
 
 ## Hardware Requirements
-For PPO, we suggest using Tensor Parallelism. The following table shows the VRAM consumption of training a 7B model on a dummy dataset with 2048 sequence length and 512 layout length with different tp_size (equal to the number of GPUs). In this experiment, we use an H800 GPU with 80GB VRAM.
+
+For SFT, we recommend using zero2 or zero2-cpu for 7B model and tp is your model is extra large. We tested the VRAM consumption on a dummy dataset with a sequence length of 2048. In all experiments, we use H800 GPUs with 80GB VRAM and enable gradient checkpointing and flash attention.
+- 2 H800 GPU
+  - zero2-cpu, micro batch size=4, VRAM Usage=22457.98 MB
+  - zero2, micro batch size=4, VRAM Usage=72390.95 MB
+- 4 H800 GPUs
+  - zero2_cpu, micro batch size=8, VRAM Usage=19412.77 MB
+  - zero2, micro batch size=8, VRAM Usage=43446.31 MB
+  - zero2, micro batch size=16, VRAM Usage=58082.30 MB
+  - zero2, micro batch size=8, lora_rank=8, VRAM Usage=21167.73 MB
+  - zero2, micro batch size=8, lora_rank=32, VRAM Usage=21344.17 MB
+
+For PPO, we suggest using Tensor Parallelism. The following table shows the VRAM consumption of training a 7B model (llama2-7B-hf) on a dummy dataset with a sequence length of 2048 and a layout length of 512 with different tp_size (equal to the number of GPUs).
 | PPO   | tp=8          | tp=4          |
 |-------|---------------|---------------|
 | bs=1  | 18485.19 MB   | 42934.45 MB   |
@@ -738,12 +839,39 @@ For PPO, we suggest using Tensor Parallelism. The following table shows the VRAM
 
 For DPO, we recommend using zero2 or zero2-cpu. We tested the VRAM consumption on a dummy dataset with 2048 sequence length.
 
+- 2 H800 GPU
+  - zero2-cpu, micro batch size=2, VRAM Usage=36989.37 MB
+  - zero2-cpu, micro batch size=4, VRAM Usage=48081.67 MB
+- 4 H800 GPUs
+  - zero2, micro batch size=4, VRAM Usage=67483.44 MB
+
+For SimPO, we recommend using zero2 or zero2-cpu. We tested the VRAM consumption on a dummy dataset with 2048 sequence length.
+
+- 2 H800 GPU
+  - zero2-cpu, micro batch size=4, VRAM 25705.26 MB
+  - zero2, micro batch size=4, VRAM Usage=73375.04 MB
+- 4 H800 GPUs
+  - zero2_cpu, micro batch size=8, VRAM Usage=36709.36 MB
+  - zero2, micro batch size=4, VRAM Usage=44330.90 MB
+  - zero2, micro batch size=8, VRAM Usage=56086.12 MB
+
+For ORPO, we recommend using zero2 or zero2-cpu. We tested the VRAM consumption on a dummy dataset with 2048 sequence length.
 
-- 1 H800 GPU
-  - zero2-cpu, batch size=2, VRAM Usage=49873.90 MB
-  - zero2-cpu, batch size=4, VRAM Usage=60998.22 MB
+- 2 H800 GPU
+  - zero2-cpu, micro batch size=4, VRAM 26693.38 MB
+  - zero2, micro batch size=4, VRAM Usage=74332.65 MB
+- 4 H800 GPUs
+  - zero2_cpu, micro batch size=8, VRAM Usage=38709.73 MB
+  - zero2, micro batch size=4, VRAM Usage=45309.52 MB
+  - zero2, micro batch size=8, VRAM Usage=58086.37 MB
+
+For KTO, we recommend using zero2-cpu or zero2 plugin, We tested the VRAM consumption on a dummy dataset with 2048 sequence length.
+- 2 H800 GPU
+  - zero2-cpu, micro batch size=2, VRAM Usage=35241.98 MB
+  - zero2-cpu, micro batch size=4, VRAM Usage=38989.37 MB
 - 4 H800 GPUs
-  - zero2, batch size=4, VRAM Usage=67544.47 MB
+  - zero2_cpu, micro batch size=2, VRAM_USAGE=32443.22 MB
+  - zero2, micro batch size=4, VRAM_USAGE=59307.97 MB
 
 ## List of Supported Models
 
diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py b/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
index 64093f88d7ca..a35f2bf52dfd 100644
--- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
+++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
@@ -40,7 +40,7 @@
 import time
 from multiprocessing import cpu_count
 
-from coati.dataset import setup_conversation_template, supervised_tokenize_sft, tokenize_prompt_dataset, tokenize_rlhf
+from coati.dataset import setup_conversation_template, tokenize_kto, tokenize_prompt, tokenize_rlhf, tokenize_sft
 from datasets import dataset_dict, load_dataset
 from transformers import AutoTokenizer
 
@@ -56,8 +56,8 @@ def main():
         type=str,
         required=True,
         default=None,
-        choices=["sft", "prompt", "preference"],
-        help="Type of dataset, chose from 'sft', 'prompt', 'preference'.",
+        choices=["sft", "prompt", "preference", "kto"],
+        help="Type of dataset, chose from 'sft', 'prompt', 'preference'. 'kto'",
     )
     parser.add_argument(
         "--data_input_dirs",
@@ -199,11 +199,13 @@ def main():
     )
 
     if args.type == "sft":
-        preparation_function = supervised_tokenize_sft
+        preparation_function = tokenize_sft
     elif args.type == "prompt":
-        preparation_function = tokenize_prompt_dataset
+        preparation_function = tokenize_prompt
     elif args.type == "preference":
         preparation_function = tokenize_rlhf
+    elif args.type == "kto":
+        preparation_function = tokenize_kto
     else:
         raise ValueError("Unknow dataset type. Please choose one from ['sft', 'prompt', 'preference']")
 
@@ -228,10 +230,13 @@ def main():
             keep_in_memory=False,
             num_proc=min(len(dataset), cpu_count()),
         )
-
-        dataset = dataset.filter(
-            lambda data: data["chosen_input_ids" if args.type == "preference" else "input_ids"] is not None
-        )
+        if args.type == "kto":
+            filter_by = "completion"
+        elif args.type == "preference":
+            filter_by = "chosen_input_ids"
+        else:
+            filter_by = "input_ids"
+        dataset = dataset.filter(lambda data: data[filter_by] is not None)
 
         # Save each jsonl spliced dataset.
         output_index = "0" * (5 - len(str(index))) + str(index)
diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_kto_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_kto_dataset.sh
new file mode 100755
index 000000000000..42c7852898d5
--- /dev/null
+++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_kto_dataset.sh
@@ -0,0 +1,14 @@
+SAVE_DIR=""
+
+rm -rf $SAVE_DIR/cache
+rm -rf $SAVE_DIR/jsonl
+rm -rf $SAVE_DIR/arrow
+
+python prepare_dataset.py --type kto \
+    --data_input_dirs /PATH/TO/KTO/DATASET \
+    --conversation_template_config /PATH/TO/CHAT/TEMPLATE/CONFIG.json \
+    --tokenizer_dir  "" \
+    --data_cache_dir $SAVE_DIR/cache \
+    --data_jsonl_output_dir $SAVE_DIR/jsonl \
+    --data_arrow_output_dir $SAVE_DIR/arrow \
+    --max_length 1024
diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.sh
index 999d7778be52..5c06b43fe076 100755
--- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.sh
+++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_preference_dataset.sh
@@ -5,9 +5,10 @@ rm -rf $SAVE_DIR/jsonl
 rm -rf $SAVE_DIR/arrow
 
 python prepare_dataset.py --type preference \
-    --data_input_dirs "PATH/TO/PREFERENCE/DATA" \
+    --data_input_dirs /PATH/TO/PREFERENCE/DATASET \
     --conversation_template_config /PATH/TO/CHAT/TEMPLATE/CONFIG.json \
     --tokenizer_dir  "" \
     --data_cache_dir $SAVE_DIR/cache \
     --data_jsonl_output_dir $SAVE_DIR/jsonl \
-    --data_arrow_output_dir $SAVE_DIR/arrow
+    --data_arrow_output_dir $SAVE_DIR/arrow \
+    --max_length 1024
diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.sh
index 8d3d6c2c2d80..d74667889e27 100755
--- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.sh
+++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_prompt_dataset.sh
@@ -10,4 +10,5 @@ python prepare_dataset.py --type prompt \
     --tokenizer_dir  "" \
     --data_cache_dir $SAVE_DIR/cache \
     --data_jsonl_output_dir $SAVE_DIR/jsonl \
-    --data_arrow_output_dir $SAVE_DIR/arrow
+    --data_arrow_output_dir $SAVE_DIR/arrow \
+    --max_length 1024
diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh
index 8562b47ee996..84bae0027c83 100755
--- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh
+++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_sft_dataset.sh
@@ -5,9 +5,10 @@ rm -rf $SAVE_DIR/jsonl
 rm -rf $SAVE_DIR/arrow
 
 python prepare_dataset.py --type sft \
-    --data_input_dirs "PATH/TO/SFT/DATA" \
+    --data_input_dirs /PATH/TO/SFT/DATASET \
     --conversation_template_config /PATH/TO/CHAT/TEMPLATE/CONFIG.json \
     --tokenizer_dir  "" \
     --data_cache_dir $SAVE_DIR/cache \
     --data_jsonl_output_dir $SAVE_DIR/jsonl \
     --data_arrow_output_dir $SAVE_DIR/arrow \
+    --max_length 4096
diff --git a/applications/ColossalChat/examples/inference/round.txt b/applications/ColossalChat/examples/inference/round.txt
new file mode 100644
index 000000000000..ba02074c1a03
--- /dev/null
+++ b/applications/ColossalChat/examples/inference/round.txt
@@ -0,0 +1,104 @@
+
+
+==========
+round 1:
+<s>[INST] <<SYS>>
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+
+<</SYS>>
+
+tell me a story [/INST] Great, let’s hear a story. </s>
+
+==========
+
+
+==========
+round 2:
+<s>[INST] <<SYS>>
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+
+<</SYS>>
+
+tell me a story [/INST] Great, let’s hear a story. </s><s>[INST] calculate 1+1 [/INST] 1+1=2 </s>
+
+==========
+
+
+==========
+round 3:
+<s>[INST] <<SYS>>
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+
+<</SYS>>
+
+tell me a story [/INST] Great, let’s hear a story. </s><s>[INST] calculate 1+1 [/INST] 1+1=2 </s><s>[INST] who is the first president of the USA [/INST] The first president of the United States was George Washington. </s>
+
+==========
+
+
+==========
+round 1:
+<s>[INST] <<SYS>>
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+
+<</SYS>>
+
+who is the first president of the USA? [/INST] The first president of the United States was George Washington. </s>
+
+==========
+
+
+==========
+round 2:
+<s>[INST] <<SYS>>
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+
+<</SYS>>
+
+who is the first president of the USA? [/INST] The first president of the United States was George Washington. </s><s>[INST] tell me a story [/INST] One story that might be interesting is the story of how the United States was founded.  In 1776, the Thirteen Colonies united together to form the new nation of America.  The first president of the new nation was George Washington.  The first state was Pennsylvania.  The first capital was Philadelphia.  The first presidential election was held in 1787, and was between George Read, a Federalist, and John Adams, a Republican.  The Federalists wanted to keep the power of the federal government limited, while the Republicans wanted the power to be spread around to the states.  The Federalists won, and George Washington was elected president. </s>
+
+==========
+
+
+==========
+round 1:
+<s>[INST] <<SYS>>
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+
+<</SYS>>
+
+tell me a story [/INST] I do enjoy telling stories. Is there a particular story you’d like to hear? </s>
+
+==========
+
+
+==========
+round 2:
+<s>[INST] <<SYS>>
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+
+<</SYS>>
+
+tell me a story [/INST] I do enjoy telling stories. Is there a particular story you’d like to hear? </s><s>[INST] about Donald Trump [/INST] I’d be happy to listen to a story about Donald Trump. I have no idea what a “story” is, but I do know a lot about human life and human behavior. I can try.  Do you have a theme or a question you’d like to ask about Donald Trump? </s>
+
+==========
+
+
+==========
+round 3:
+<s>[INST] <<SYS>>
+A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+
+<</SYS>>
+
+tell me a story [/INST] I do enjoy telling stories. Is there a particular story you’d like to hear? </s><s>[INST] about Donald Trump [/INST] I’d be happy to listen to a story about Donald Trump. I have no idea what a “story” is, but I do know a lot about human life and human behavior. I can try.  Do you have a theme or a question you’d like to ask about Donald Trump? </s><s>[INST] Is Donald Trump the president of the United States [/INST] Yes, Donald Trump became the 45th president of the United States in January of 2016. </s>
+
+==========
diff --git a/applications/ColossalChat/examples/requirements.txt b/applications/ColossalChat/examples/requirements.txt
index 838590f4b103..91f25a5cf843 100644
--- a/applications/ColossalChat/examples/requirements.txt
+++ b/applications/ColossalChat/examples/requirements.txt
@@ -1,4 +1,4 @@
 pandas>=1.4.1
 sentencepiece
-colossalai
+colossalai==0.4.0
 prompt_toolkit
diff --git a/applications/ColossalChat/examples/training_scripts/hostfile b/applications/ColossalChat/examples/training_scripts/hostfile
index c7aed75a331a..2fbb50c4a8dc 100755
--- a/applications/ColossalChat/examples/training_scripts/hostfile
+++ b/applications/ColossalChat/examples/training_scripts/hostfile
@@ -1,5 +1 @@
-XXX.XX.XXX.XXX   # Your master IP
-XXX.XX.XXX.XXX   # Your slave IPs
-XXX.XX.XXX.XXX   # Your slave IPs
-XXX.XX.XXX.XXX   # Your slave IPs
-XXX.XX.XXX.XXX   # Your slave IPs
+localhost
diff --git a/applications/ColossalChat/examples/training_scripts/lora_config.json b/applications/ColossalChat/examples/training_scripts/lora_config.json
new file mode 100644
index 000000000000..4565f9e9ba82
--- /dev/null
+++ b/applications/ColossalChat/examples/training_scripts/lora_config.json
@@ -0,0 +1,9 @@
+{
+    "r": 128,
+    "embedding_lora_dropout": 0.0,
+    "linear_lora_dropout": 0.1,
+    "lora_alpha": 32,
+    "lora_train_bias": "all",
+    "lora_initialization_method": "PiSSA",
+    "target_modules": ["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj", "embed_tokens"]
+}
diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.py b/applications/ColossalChat/examples/training_scripts/train_dpo.py
index a5b4cb3bd66e..44131f572445 100755
--- a/applications/ColossalChat/examples/training_scripts/train_dpo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_dpo.py
@@ -6,7 +6,7 @@
 
 import torch
 from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler, load_tokenized_dataset
-from coati.models import convert_to_lora_module, disable_dropout
+from coati.models import LoraConfig, convert_to_lora_module, disable_dropout
 from coati.trainer import DPOTrainer
 from coati.utils import load_checkpoint
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -23,8 +23,11 @@
 
 
 def train(args):
+    lora_config = None
+    if args.lora_config is not None:
+        lora_config = LoraConfig.from_file(args.lora_config)
     # check lora compatibility
-    if "gemini" in args.plugin and args.lora_rank > 0:
+    if "gemini" in args.plugin and lora_config is not None and lora_config.r > 0:
         raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin")
     if args.plugin == "gemini_auto" and args.accumulation_steps > 1:
         raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin")
@@ -115,8 +118,8 @@ def train(args):
             coordinator.print_on_master(msg="Flash-attention enabled successfully")
         else:
             model = AutoModelForCausalLM.from_pretrained(args.pretrain)
-        disable_dropout(model)
-        if args.enable_reference_model:
+
+        if not args.disable_reference_model:
             if args.use_flash_attn:
                 ref_model = AutoModelForCausalLM.from_pretrained(
                     args.pretrain,
@@ -125,18 +128,20 @@ def train(args):
                 )
             else:
                 ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain)
-            disable_dropout(ref_model)
         else:
             ref_model = None
+        if args.lora_config is not None:
+            model = convert_to_lora_module(model, lora_config=lora_config)
+            for name, module in model.named_modules():
+                if "norm" in name or "gate" in name:
+                    module = module.to(torch.float32)
+        disable_dropout(model)
+        disable_dropout(ref_model)
 
-        if args.lora_rank > 0:
-            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
-
-    if args.grad_checkpoint and args.lora_rank == 0:
-        model.gradient_checkpointing_enable()
+    if args.grad_checkpoint:
+        # Note, for some models, lora may not be compatible with gradient checkpointing
+        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-    elif args.lora_rank > 0:
-        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
 
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
@@ -178,6 +183,21 @@ def train(args):
         collate_fn=data_collator,
         distributed_sampler_cls=StatefulDistributedSampler,
     )
+    eval_dataloader = None
+    if args.eval_dataset:
+        eval_dataset = load_tokenized_dataset(dataset_paths=args.eval_dataset, mode="dev")
+        eval_data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
+
+        eval_dataloader = plugin.prepare_dataloader(
+            dataset=eval_dataset,
+            batch_size=args.batch_size,
+            shuffle=True,
+            drop_last=True,
+            collate_fn=eval_data_collator,
+            distributed_sampler_cls=StatefulDistributedSampler,
+        )
+    else:
+        logger.warning("No evaluation dataset is provided, skip evaluation")
 
     num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
     if args.warmup_steps is None:
@@ -255,25 +275,28 @@ def train(args):
         save_interval=args.save_interval,
         save_dir=args.save_dir,
         coordinator=coordinator,
+        beta=args.beta,
+        gamma=args.gamma,
+        length_normalization=args.length_normalization,
     )
 
     trainer.fit(
         train_preference_dataloader=train_dataloader,
-        eval_preference_dataloader=None,
+        eval_preference_dataloader=eval_dataloader,
         log_dir=args.log_dir,
         use_wandb=args.use_wandb,
     )
 
-    if args.lora_rank > 0 and args.merge_lora_weights:
-        from coati.models.lora import LORA_MANAGER
-
+    if lora_config is not None and lora_config.r > 0:
         # NOTE: set model to eval to merge LoRA weights
-        LORA_MANAGER.merge_weights = True
         model.eval()
     # save model checkpoint after fitting on only rank0
-    coordinator.print_on_master("Start saving final model checkpoint")
-    booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
-    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}")
+    if args.save_dir is not None:
+        coordinator.print_on_master("Start saving final model checkpoint")
+        booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
+        coordinator.print_on_master(
+            f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}"
+        )
 
     coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
 
@@ -296,6 +319,10 @@ def train(args):
     parser.add_argument("--tp", type=int, default=1)
     parser.add_argument("--pp", type=int, default=1)
     parser.add_argument("--sp", type=int, default=1)
+    parser.add_argument("--loss_type", type=str, default="dpo_loss", help="dpo_loss or simpo_loss")
+    parser.add_argument("--beta", type=float, default=0.1, help="beta in DPO loss")
+    parser.add_argument("--gamma", type=float, default=0.0, help="gamma in SimPO loss")
+    parser.add_argument("--length_normalization", default=False, action="store_true")
     parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true")
     parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2])
     parser.add_argument("--zero_cpu_offload", default=False, action="store_true")
@@ -304,33 +331,39 @@ def train(args):
     parser.add_argument("--model_type", type=str, default=None)
     parser.add_argument("--tokenizer_dir", type=str, default=None)
     parser.add_argument("--dataset", nargs="+", default=[])
+    parser.add_argument("--eval_dataset", nargs="+", default=[])
     parser.add_argument(
         "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
     )
-    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
-    parser.add_argument("--save_dir", type=str, default="output")
+    parser.add_argument("--config_file", type=str, default=None, help="Config file")
+    parser.add_argument("--save_dir", type=str, default=None)
     parser.add_argument("--max_length", type=int, default=2048, help="Model max length")
     parser.add_argument("--max_epochs", type=int, default=3)
     parser.add_argument("--batch_size", type=int, default=4)
-    parser.add_argument("--enable_reference_model", type=bool, default=True)
-    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
-    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
     parser.add_argument(
-        "--lora_train_bias",
-        type=str,
-        default="none",
-        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
+        "--disable_reference_model",
+        action="store_true",
+        default=False,
+        help="Disable the reference model (enabled by default)",
     )
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--lora_config", type=str, default=None, help="low-rank adaptation config file path")
     parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
-    parser.add_argument("--merge_lora_weights", type=bool, default=True)
     parser.add_argument("--lr", type=float, default=5e-6)
     parser.add_argument("--accumulation_steps", type=int, default=8)
-    parser.add_argument("--log_dir", default="logs", type=str)
+    parser.add_argument("--log_dir", default=None, type=str)
     parser.add_argument("--use_wandb", default=False, action="store_true")
     parser.add_argument("--grad_checkpoint", default=False, action="store_true")
     parser.add_argument("--use_flash_attn", default=False, action="store_true")
     args = parser.parse_args()
-    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
-    with open(args.config_file, "w") as f:
-        json.dump(args.__dict__, f, indent=4)
+
+    # fool proof hyperparameter setup
+    if args.loss_type == "simpo_loss":
+        args.length_normalization = True
+        args.gamma = args.gamma if args.gamma > 0 else 1.4
+
+    if args.config_file is not None:
+        os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
+        with open(args.config_file, "w") as f:
+            json.dump(args.__dict__, f, indent=4)
     train(args)
diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.sh b/applications/ColossalChat/examples/training_scripts/train_dpo.sh
index 80fc30c3d955..4d49bc2188eb 100755
--- a/applications/ColossalChat/examples/training_scripts/train_dpo.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_dpo.sh
@@ -13,50 +13,52 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "Now CUDA_VISIBLE_DEVICES is set to:"
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
-set_n_least_used_CUDA_VISIBLE_DEVICES 8
-# export CUDA_VISIBLE_DEVICES=6
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
 
-PROJECT_NAME="dpo"
+PROJECT_NAME="DPO"
 PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
-PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs
 PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
+PARENT_LOG_DIR="" # Path to a folder to save training config logs
 PRETRAINED_MODEL_PATH="" # huggingface or local model path
 PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
 
 declare -a dataset=(
-    YOUR/DATA/DIR/arrow/part-00000
-    YOUR/DATA/DIR/arrow/part-00001
-    YOUR/DATA/DIR/arrow/part-00002
-    YOUR/DATA/DIR/arrow/part-00003
-    YOUR/DATA/DIR/arrow/part-00004
-    YOUR/DATA/DIR/arrow/part-00005
-    YOUR/DATA/DIR/arrow/part-00006
-    YOUR/DATA/DIR/arrow/part-00007
-    YOUR/DATA/DIR/arrow/part-00008
-    YOUR/DATA/DIR/arrow/part-00009
+    /Your/Preference/Data/arrow/part-00000
+    /Your/Preference/Data/arrow/part-00001
+    /Your/Preference/Data/arrow/part-00002
+    /Your/Preference/Data/arrow/part-00003
+    /Your/Preference/Data/arrow/part-00004
+    /Your/Preference/Data/arrow/part-00005
+    /Your/Preference/Data/arrow/part-00006
+    /Your/Preference/Data/arrow/part-00007
+    /Your/Preference/Data/arrow/part-00008
+    /Your/Preference/Data/arrow/part-00009
 )
 
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
 FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
+LOG_DIR="${PARENT_LOG_DIR}${FULL_PROJECT_NAME}"
 
-colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31312 train_dpo.py \
+colossalai run --nproc_per_node 4 --hostfile hostfile --master_port 31313 train_dpo.py \
     --pretrain $PRETRAINED_MODEL_PATH \
-    --checkpoint_path $PRETRAINED_MODEL_PATH \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
     --dataset ${dataset[@]} \
     --plugin "zero2" \
     --save_interval 1000 \
     --save_dir $SAVE_DIR \
     --config_file $CONFIG_FILE \
+    --log_dir $LOG_DIR \
     --max_epochs 1 \
-    --accumulation_steps 4 \
-    --batch_size 2 \
+    --accumulation_steps 2 \
+    --batch_size 16 \
     --lr 1e-6 \
+    --beta 0.1 \
     --mixed_precision "bf16" \
     --grad_clip 1.0 \
+    --max_length 4096 \
     --weight_decay 0.01 \
-    --warmup_steps 100 \
+    --warmup_steps 60 \
     --grad_checkpoint \
     --use_wandb
diff --git a/applications/ColossalChat/examples/training_scripts/train_kto.py b/applications/ColossalChat/examples/training_scripts/train_kto.py
new file mode 100755
index 000000000000..d063b82bb214
--- /dev/null
+++ b/applications/ColossalChat/examples/training_scripts/train_kto.py
@@ -0,0 +1,376 @@
+import argparse
+import json
+import os
+import resource
+from contextlib import nullcontext
+
+import torch
+from coati.dataset import DataCollatorForKTODataset, StatefulDistributedSampler, load_tokenized_dataset
+from coati.models import LoraConfig, convert_to_lora_module, disable_dropout
+from coati.trainer import KTOTrainer
+from coati.utils import load_checkpoint
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.logging import get_dist_logger
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+
+logger = get_dist_logger()
+
+
+def train(args):
+    lora_config = None
+    if args.lora_config is not None:
+        lora_config = LoraConfig.from_file(args.lora_config)
+    # check lora compatibility
+    if "gemini" in args.plugin and lora_config is not None and lora_config.r > 0:
+        raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin")
+    if args.plugin == "gemini_auto" and args.accumulation_steps > 1:
+        raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin")
+
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch()
+    coordinator = DistCoordinator()
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "ddp":
+        """
+        Default torch ddp plugin without any acceleration, for
+        debugging purpose acceleration, for debugging purpose
+        """
+        plugin = TorchDDPPlugin(find_unused_parameters=True)
+    elif args.plugin == "gemini":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="static",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+            enable_gradient_accumulation=True,
+            enable_flash_attention=args.use_flash_attn,
+        )
+    elif args.plugin == "gemini_auto":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="auto",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+            enable_flash_attention=args.use_flash_attn,
+        )
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2_cpu":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            cpu_offload=True,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "3d":
+        plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=args.pp,
+            sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
+            zero_stage=args.zero_stage,
+            enable_flash_attention=args.use_flash_attn,
+            enable_sequence_parallelism=args.enable_sequence_parallelism,
+            cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False,
+            parallel_output=False,
+            max_norm=args.grad_clip,
+            precision=args.mixed_precision,
+        )
+    else:
+        raise ValueError(f"Unknown plugin {args.plugin}")
+
+    booster = Booster(plugin=plugin)
+    ref_booster = Booster(plugin=plugin)
+
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    # Temp Fix: Disable lazy init due to version conflict
+    # init_ctx = (
+    #     LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
+    # )
+
+    init_ctx = nullcontext()
+    with init_ctx:
+        if args.use_flash_attn:
+            model = AutoModelForCausalLM.from_pretrained(
+                args.pretrain,
+                torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16,
+                use_flash_attention_2=True,
+            )
+            coordinator.print_on_master(msg="Flash-attention enabled successfully")
+        else:
+            model = AutoModelForCausalLM.from_pretrained(args.pretrain)
+
+        if args.use_flash_attn:
+            ref_model = AutoModelForCausalLM.from_pretrained(
+                args.pretrain,
+                torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16,
+                use_flash_attention_2=True,
+            )
+        else:
+            ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain)
+        if args.lora_config is not None:
+            model = convert_to_lora_module(model, lora_config=lora_config)
+            for name, module in model.named_modules():
+                if "norm" in name or "gate" in name:
+                    module = module.to(torch.float32)
+        disable_dropout(ref_model)
+        disable_dropout(model)
+
+    if args.grad_checkpoint:
+        # Note, for some models, lora may not be compatible with gradient checkpointing
+        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
+        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+
+    # configure tokenizer
+    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
+    if hasattr(tokenizer, "pad_token") and hasattr(tokenizer, "eos_token") and tokenizer.eos_token is not None:
+        try:
+            # Some tokenizers doesn't allow to set pad_token mannually e.g., Qwen
+            tokenizer.pad_token = tokenizer.eos_token
+        except AttributeError as e:
+            logger.warning(f"Unable to set pad token to eos token, {str(e)}")
+    if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None:
+        logger.warning(
+            "The tokenizer does not have a pad token which is required. May lead to unintended behavior in training, Please consider manually set them."
+        )
+
+    tokenizer.add_bos_token = False
+    tokenizer.add_eos_token = False
+
+    # configure optimizer
+    optim = HybridAdam(
+        model_params=model.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    # configure dataset
+    coordinator.print_on_master(f"Load dataset: {args.dataset}")
+    mode_map = {"train": "train", "valid": "validation", "test": "test"}
+    train_dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train", mode_map=mode_map)
+    num_desirable = 0
+    num_undesirable = 0
+    for i in range(len(train_dataset)):
+        if train_dataset[i]["label"]:
+            num_desirable += 1
+        else:
+            num_undesirable += 1
+    logger.info(f"Dataset Statistics:\nDesirable: {num_desirable}\nUndesirable: {num_undesirable}")
+
+    # Check if the user specified weights fit into the theoratical lower and upper bounds from Eq. (8) of https://arxiv.org/abs/2402.01306
+    actual_ratio = (args.desirable_weight * num_desirable) / (args.undesirable_weight * num_undesirable)
+    if actual_ratio < 1 or actual_ratio > 4 / 3:
+        if not args.auto_weight:
+            raise AssertionError(
+                f"Desirable weight and undesirable weight are not within the theoratical bounds, [1, 4/3]. Actual ratio: {actual_ratio}, please increase/decrease desirable weight or decrease/increase undesirable weight."
+            )
+        else:
+            args.desirable_weight = args.desirable_weight / actual_ratio
+            coordinator.print_on_master(
+                f"Desirable weight and undesirable weight are not within the theoratical bounds, [1, 4/3]. Actual ratio: {actual_ratio}, auto weight is enabled, set desirable weight to {args.desirable_weight} and undesirable weight to {args.undesirable_weight}"
+            )
+
+    data_collator = DataCollatorForKTODataset(tokenizer=tokenizer, max_length=args.max_length)
+
+    train_dataloader = plugin.prepare_dataloader(
+        dataset=train_dataset,
+        batch_size=args.batch_size,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+        distributed_sampler_cls=StatefulDistributedSampler,
+    )
+    eval_dataloader = None
+    if args.eval_dataset:
+        eval_dataset = load_tokenized_dataset(dataset_paths=args.eval_dataset, mode="dev")
+        eval_data_collator = DataCollatorForKTODataset(tokenizer=tokenizer, max_length=args.max_length)
+
+        eval_dataloader = plugin.prepare_dataloader(
+            dataset=eval_dataset,
+            batch_size=args.batch_size,
+            shuffle=True,
+            drop_last=True,
+            collate_fn=eval_data_collator,
+            distributed_sampler_cls=StatefulDistributedSampler,
+        )
+    else:
+        logger.warning("No evaluation dataset is provided, skip evaluation")
+
+    num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
+    if args.warmup_steps is None:
+        args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
+        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
+
+    lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=optim,
+        total_steps=args.max_epochs * num_update_steps_per_epoch,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
+    )
+
+    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    model, optim, _, train_dataloader, lr_scheduler = booster.boost(
+        model=model,
+        optimizer=optim,
+        lr_scheduler=lr_scheduler,
+        dataloader=train_dataloader,
+    )
+    if ref_model is not None:
+        ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_dataloader)
+    torch.set_default_dtype(torch.float)
+
+    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+    )
+
+    start_epoch = 0
+    sampler_start_idx = 0
+    start_step = 0
+    if args.checkpoint_path is not None:
+        if "modeling" in args.checkpoint_path:
+            coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}")
+            booster.load_model(model, args.checkpoint_path)
+        else:
+            coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}")
+            start_epoch, start_step, sampler_start_idx = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=booster,
+                model=model,
+                optimizer=optim,
+                lr_scheduler=lr_scheduler,
+            )
+            assert isinstance(train_dataloader.sampler, StatefulDistributedSampler)
+            train_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+            coordinator.print_on_master(
+                f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}"
+            )
+            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
+
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    trainer = KTOTrainer(
+        actor=model,
+        ref_model=ref_model,
+        booster=booster,
+        actor_optim=optim,
+        actor_lr_scheduler=lr_scheduler,
+        tokenizer=tokenizer,
+        max_epochs=args.max_epochs,
+        accumulation_steps=args.accumulation_steps,
+        start_epoch=start_epoch,
+        save_interval=args.save_interval,
+        save_dir=args.save_dir,
+        coordinator=coordinator,
+        beta=args.beta,
+        desirable_weight=args.desirable_weight,
+        undesirable_weight=args.undesirable_weight,
+    )
+
+    trainer.fit(
+        train_preference_dataloader=train_dataloader,
+        eval_preference_dataloader=eval_dataloader,
+        log_dir=args.log_dir,
+        use_wandb=args.use_wandb,
+    )
+
+    if lora_config is not None and lora_config.r > 0:
+        # NOTE: set model to eval to merge LoRA weights
+        model.eval()
+    # save model checkpoint after fitting on only rank0
+    if args.save_dir is not None:
+        coordinator.print_on_master("Start saving final model checkpoint")
+        booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
+        coordinator.print_on_master(
+            f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}"
+        )
+
+    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
+        help="Choose which plugin to use",
+    )
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--pp", type=int, default=1)
+    parser.add_argument("--sp", type=int, default=1)
+    parser.add_argument("--beta", type=float, default=0.1, help="beta in KTO loss")
+    parser.add_argument("--desirable_weight", type=float, default=1.0, help="desirable_weight in KTO loss")
+    parser.add_argument("--undesirable_weight", type=float, default=1.0, help="undesirable_weight in KTO loss")
+    parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true")
+    parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2])
+    parser.add_argument("--zero_cpu_offload", default=False, action="store_true")
+    parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"])
+    parser.add_argument("--pretrain", type=str, default=None)
+    parser.add_argument("--tokenizer_dir", type=str, default=None)
+    parser.add_argument("--dataset", nargs="+", default=[])
+    parser.add_argument("--eval_dataset", nargs="+", default=[])
+    parser.add_argument(
+        "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
+    )
+    parser.add_argument("--config_file", type=str, default=None, help="Config file")
+    parser.add_argument("--save_dir", type=str, default=None)
+    parser.add_argument("--max_length", type=int, default=2048, help="Model max length")
+    parser.add_argument("--max_epochs", type=int, default=3)
+    parser.add_argument("--batch_size", type=int, default=4)
+
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--lora_config", type=str, default=None, help="low-rank adaptation config file path")
+    parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
+    parser.add_argument("--auto_weight", default=False, action="store_true")
+    parser.add_argument("--lr", type=float, default=5e-6)
+    parser.add_argument("--accumulation_steps", type=int, default=8)
+    parser.add_argument("--log_dir", default=None, type=str)
+    parser.add_argument("--use_wandb", default=False, action="store_true")
+    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
+    parser.add_argument("--use_flash_attn", default=False, action="store_true")
+    args = parser.parse_args()
+    if args.config_file is not None:
+        os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
+        with open(args.config_file, "w") as f:
+            json.dump(args.__dict__, f, indent=4)
+    train(args)
diff --git a/applications/ColossalChat/examples/training_scripts/train_kto.sh b/applications/ColossalChat/examples/training_scripts/train_kto.sh
new file mode 100755
index 000000000000..c28338c220dd
--- /dev/null
+++ b/applications/ColossalChat/examples/training_scripts/train_kto.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+
+PROJECT_NAME="kto"
+PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
+PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs
+PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
+PARENT_LOG_DIR="" # Path to a folder to save training config logs
+PRETRAINED_MODEL_PATH="" # huggingface or local model path
+PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
+
+declare -a dataset=(
+    /Your/KTO/Data/arrow/part-00000
+    /Your/KTO/Data/arrow/part-00001
+    /Your/KTO/Data/arrow/part-00002
+    /Your/KTO/Data/arrow/part-00003
+    /Your/KTO/Data/arrow/part-00004
+    /Your/KTO/Data/arrow/part-00005
+    /Your/KTO/Data/arrow/part-00006
+    /Your/KTO/Data/arrow/part-00007
+    /Your/KTO/Data/arrow/part-00008
+    /Your/KTO/Data/arrow/part-00009
+)
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+LOG_DIR="${PARENT_LOG_DIR}${FULL_PROJECT_NAME}"
+
+colossalai run --nproc_per_node 4 --master_port 31313 train_kto.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --plugin "zero2" \
+    --save_interval 1000 \
+    --save_dir $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --log_dir $LOG_DIR \
+    --max_epochs 1 \
+    --accumulation_steps 1 \
+    --batch_size 8 \
+    --auto_weight \
+    --lr 1e-5 \
+    --beta 0.1 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --max_length 1024 \
+    --weight_decay 0.01 \
+    --warmup_steps 60 \
+    --grad_checkpoint
diff --git a/applications/ColossalChat/examples/training_scripts/train_orpo.py b/applications/ColossalChat/examples/training_scripts/train_orpo.py
new file mode 100755
index 000000000000..f06524507d5f
--- /dev/null
+++ b/applications/ColossalChat/examples/training_scripts/train_orpo.py
@@ -0,0 +1,341 @@
+import argparse
+import json
+import os
+import resource
+from contextlib import nullcontext
+
+import torch
+from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler, load_tokenized_dataset
+from coati.models import LoraConfig, convert_to_lora_module, disable_dropout
+from coati.trainer import ORPOTrainer
+from coati.utils import load_checkpoint
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.logging import get_dist_logger
+from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
+from colossalai.nn.optimizer import HybridAdam
+
+logger = get_dist_logger()
+
+
+def train(args):
+    lora_config = None
+    if args.lora_config is not None:
+        lora_config = LoraConfig.from_file(args.lora_config)
+    # check lora compatibility
+    if "gemini" in args.plugin and lora_config is not None and lora_config.r > 0:
+        raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin")
+    if args.plugin == "gemini_auto" and args.accumulation_steps > 1:
+        raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin")
+
+    # ==============================
+    # Initialize Distributed Training
+    # ==============================
+    colossalai.launch_from_torch()
+    coordinator = DistCoordinator()
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "ddp":
+        """
+        Default torch ddp plugin without any acceleration, for
+        debugging purpose acceleration, for debugging purpose
+        """
+        plugin = TorchDDPPlugin(find_unused_parameters=True)
+    elif args.plugin == "gemini":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="static",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+            enable_gradient_accumulation=True,
+            enable_flash_attention=args.use_flash_attn,
+        )
+    elif args.plugin == "gemini_auto":
+        plugin = GeminiPlugin(
+            precision=args.mixed_precision,
+            placement_policy="auto",
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+            enable_flash_attention=args.use_flash_attn,
+        )
+    elif args.plugin == "zero2":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "zero2_cpu":
+        plugin = LowLevelZeroPlugin(
+            stage=2,
+            precision=args.mixed_precision,
+            initial_scale=2**16,
+            cpu_offload=True,
+            max_norm=args.grad_clip,
+        )
+    elif args.plugin == "3d":
+        plugin = HybridParallelPlugin(
+            tp_size=args.tp,
+            pp_size=args.pp,
+            sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
+            zero_stage=args.zero_stage,
+            enable_flash_attention=args.use_flash_attn,
+            enable_sequence_parallelism=args.enable_sequence_parallelism,
+            cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False,
+            parallel_output=False,
+            max_norm=args.grad_clip,
+            precision=args.mixed_precision,
+        )
+    else:
+        raise ValueError(f"Unknown plugin {args.plugin}")
+
+    booster = Booster(plugin=plugin)
+
+    # ======================================================
+    # Initialize Model, Objective, Optimizer and LR Scheduler
+    # ======================================================
+    # Temp Fix: Disable lazy init due to version conflict
+    # init_ctx = (
+    #     LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
+    # )
+
+    init_ctx = nullcontext()
+    with init_ctx:
+        if args.use_flash_attn:
+            model = AutoModelForCausalLM.from_pretrained(
+                args.pretrain,
+                torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16,
+                use_flash_attention_2=True,
+            )
+            coordinator.print_on_master(msg="Flash-attention enabled successfully")
+        else:
+            model = AutoModelForCausalLM.from_pretrained(args.pretrain)
+        if args.lora_config is not None:
+            model = convert_to_lora_module(model, lora_config=lora_config)
+            for name, module in model.named_modules():
+                if "norm" in name or "gate" in name:
+                    module = module.to(torch.float32)
+        disable_dropout(model)
+
+    if args.grad_checkpoint:
+        # Note, for some models, lora may not be compatible with gradient checkpointing
+        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
+        coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
+
+    # configure tokenizer
+    tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False, trust_remote_code=True)
+    if hasattr(tokenizer, "pad_token") and hasattr(tokenizer, "eos_token") and tokenizer.eos_token is not None:
+        try:
+            # Some tokenizers doesn't allow to set pad_token mannually e.g., Qwen
+            tokenizer.pad_token = tokenizer.eos_token
+        except AttributeError as e:
+            logger.warning(f"Unable to set pad token to eos token, {str(e)}")
+    if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None:
+        logger.warning(
+            "The tokenizer does not have a pad token which is required. May lead to unintended behavior in training, Please consider manually set them."
+        )
+
+    tokenizer.add_bos_token = False
+    tokenizer.add_eos_token = False
+
+    # configure optimizer
+    optim = HybridAdam(
+        model_params=model.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
+    # configure dataset
+    coordinator.print_on_master(f"Load dataset: {args.dataset}")
+    mode_map = {"train": "train", "valid": "validation", "test": "test"}
+    train_dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train", mode_map=mode_map)
+    data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
+
+    train_dataloader = plugin.prepare_dataloader(
+        dataset=train_dataset,
+        batch_size=args.batch_size,
+        shuffle=True,
+        drop_last=True,
+        collate_fn=data_collator,
+        distributed_sampler_cls=StatefulDistributedSampler,
+    )
+
+    eval_dataloader = None
+    if args.eval_dataset:
+        eval_dataset = load_tokenized_dataset(dataset_paths=args.eval_dataset, mode="dev")
+        eval_data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
+        eval_dataloader = plugin.prepare_dataloader(
+            dataset=eval_dataset,
+            batch_size=args.batch_size,
+            shuffle=True,
+            drop_last=True,
+            collate_fn=eval_data_collator,
+            distributed_sampler_cls=StatefulDistributedSampler,
+        )
+    else:
+        logger.warning("No evaluation dataset is provided, skip evaluation")
+
+    num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
+    if args.warmup_steps is None:
+        args.warmup_steps = int(args.max_epochs * 0.025 * (len(train_dataloader) // args.accumulation_steps))
+        coordinator.print_on_master(f"Warmup steps is set to {args.warmup_steps}")
+
+    lr_scheduler = CosineAnnealingWarmupLR(
+        optimizer=optim,
+        total_steps=args.max_epochs * num_update_steps_per_epoch,
+        warmup_steps=args.warmup_steps,
+        eta_min=0.1 * args.lr,
+    )
+
+    default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
+    torch.set_default_dtype(default_dtype)
+    model, optim, _, train_dataloader, lr_scheduler = booster.boost(
+        model=model,
+        optimizer=optim,
+        lr_scheduler=lr_scheduler,
+        dataloader=train_dataloader,
+    )
+    torch.set_default_dtype(torch.float)
+
+    coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+    )
+
+    start_epoch = 0
+    sampler_start_idx = 0
+    start_step = 0
+    if args.checkpoint_path is not None:
+        if "modeling" in args.checkpoint_path:
+            coordinator.print_on_master(f"Continued pretrain from checkpoint {args.checkpoint_path}")
+            booster.load_model(model, args.checkpoint_path)
+        else:
+            coordinator.print_on_master(f"Load model checkpoint from {args.checkpoint_path}")
+            start_epoch, start_step, sampler_start_idx = load_checkpoint(
+                load_dir=args.checkpoint_path,
+                booster=booster,
+                model=model,
+                optimizer=optim,
+                lr_scheduler=lr_scheduler,
+            )
+            assert isinstance(train_dataloader.sampler, StatefulDistributedSampler)
+            train_dataloader.sampler.set_start_index(start_index=sampler_start_idx)
+
+            coordinator.print_on_master(
+                f"Loaded checkpoint {args.checkpoint_path} at epoch {start_epoch} step {start_step}"
+            )
+            coordinator.print_on_master(f"Loaded sample at index {sampler_start_idx}")
+
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded CUDA memory: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB"
+        )
+        coordinator.print_on_master(
+            f"Checkpoint loaded max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024:.2f} MB"
+        )
+
+    trainer = ORPOTrainer(
+        actor=model,
+        booster=booster,
+        actor_optim=optim,
+        actor_lr_scheduler=lr_scheduler,
+        tokenizer=tokenizer,
+        max_epochs=args.max_epochs,
+        accumulation_steps=args.accumulation_steps,
+        start_epoch=start_epoch,
+        save_interval=args.save_interval,
+        save_dir=args.save_dir,
+        coordinator=coordinator,
+        lam=args.lam,
+    )
+
+    trainer.fit(
+        train_preference_dataloader=train_dataloader,
+        eval_preference_dataloader=eval_dataloader,
+        log_dir=args.log_dir,
+        use_wandb=args.use_wandb,
+    )
+
+    if lora_config is not None and lora_config.r > 0:
+        # NOTE: set model to eval to merge LoRA weights
+        model.eval()
+    # save model checkpoint after fitting on only rank0
+    if args.save_dir is not None:
+        coordinator.print_on_master("Start saving final model checkpoint")
+        booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
+        coordinator.print_on_master(
+            f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}"
+        )
+
+    coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
+        help="Choose which plugin to use",
+    )
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument("--tp", type=int, default=1)
+    parser.add_argument("--pp", type=int, default=1)
+    parser.add_argument("--sp", type=int, default=1)
+    parser.add_argument("--lam", type=float, default=0.1, help="lambda in ORPO loss")
+    parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true")
+    parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2])
+    parser.add_argument("--zero_cpu_offload", default=False, action="store_true")
+    parser.add_argument("--sp_mode", type=str, default="split_gather", choices=["split_gather", "ring", "all_to_all"])
+    parser.add_argument("--pretrain", type=str, default=None)
+    parser.add_argument("--model_type", type=str, default=None)
+    parser.add_argument("--tokenizer_dir", type=str, default=None)
+    parser.add_argument("--dataset", nargs="+", default=[])
+    parser.add_argument("--eval_dataset", nargs="+", default=[])
+    parser.add_argument(
+        "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
+    )
+    parser.add_argument("--config_file", type=str, default=None, help="Config file")
+    parser.add_argument("--save_dir", type=str, default=None)
+    parser.add_argument("--max_length", type=int, default=2048, help="Model max length")
+    parser.add_argument("--max_epochs", type=int, default=3)
+    parser.add_argument("--batch_size", type=int, default=4)
+    parser.add_argument(
+        "--disable_reference_model",
+        action="store_true",
+        default=False,
+        help="Disable the reference model (enabled by default)",
+    )
+    parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
+    parser.add_argument("--lora_config", type=str, default=None, help="low-rank adaptation config file path")
+    parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
+    parser.add_argument("--lr", type=float, default=5e-6)
+    parser.add_argument("--accumulation_steps", type=int, default=8)
+    parser.add_argument("--log_dir", default=None, type=str)
+    parser.add_argument("--use_wandb", default=False, action="store_true")
+    parser.add_argument("--grad_checkpoint", default=False, action="store_true")
+    parser.add_argument("--use_flash_attn", default=False, action="store_true")
+    args = parser.parse_args()
+    if args.config_file is not None:
+        os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
+        with open(args.config_file, "w") as f:
+            json.dump(args.__dict__, f, indent=4)
+    train(args)
diff --git a/applications/ColossalChat/examples/training_scripts/train_orpo.sh b/applications/ColossalChat/examples/training_scripts/train_orpo.sh
new file mode 100755
index 000000000000..48327e014adf
--- /dev/null
+++ b/applications/ColossalChat/examples/training_scripts/train_orpo.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
+
+PROJECT_NAME="ORPO"
+PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
+PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
+PARENT_LOG_DIR="" # Path to a folder to save training config logs
+PRETRAINED_MODEL_PATH="" # huggingface or local model path
+PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
+
+declare -a dataset=(
+    /Your/Preference/Data/arrow/part-00000
+    /Your/Preference/Data/arrow/part-00001
+    /Your/Preference/Data/arrow/part-00002
+    /Your/Preference/Data/arrow/part-00003
+    /Your/Preference/Data/arrow/part-00004
+    /Your/Preference/Data/arrow/part-00005
+    /Your/Preference/Data/arrow/part-00006
+    /Your/Preference/Data/arrow/part-00007
+    /Your/Preference/Data/arrow/part-00008
+    /Your/Preference/Data/arrow/part-00009
+)
+
+TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
+FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
+SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
+CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
+LOG_DIR="${PARENT_LOG_DIR}${FULL_PROJECT_NAME}"
+
+colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31313 train_orpo.py \
+    --pretrain $PRETRAINED_MODEL_PATH \
+    --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
+    --dataset ${dataset[@]} \
+    --plugin "zero2" \
+    --save_interval 1000 \
+    --save_dir $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --log_dir $LOG_DIR \
+    --max_epochs 3 \
+    --accumulation_steps 1 \
+    --batch_size 16 \
+    --lr 8e-6 \
+    --lam 0.5 \
+    --mixed_precision "bf16" \
+    --grad_clip 1.0 \
+    --max_length 1024 \
+    --weight_decay 0.01 \
+    --warmup_steps 60 \
+    --grad_checkpoint \
+    --use_wandb
diff --git a/applications/ColossalChat/examples/training_scripts/train_ppo.py b/applications/ColossalChat/examples/training_scripts/train_ppo.py
index 3da3e9ca641e..333be9963c06 100755
--- a/applications/ColossalChat/examples/training_scripts/train_ppo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_ppo.py
@@ -13,7 +13,7 @@
     load_tokenized_dataset,
     setup_conversation_template,
 )
-from coati.models import Critic, RewardModel, convert_to_lora_module, disable_dropout
+from coati.models import Critic, LoraConfig, RewardModel, convert_to_lora_module, disable_dropout, lora_manager
 from coati.trainer import PPOTrainer
 from coati.utils import load_checkpoint
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -31,8 +31,11 @@
 
 
 def train(args):
+    lora_config = None
+    if args.lora_config is not None:
+        lora_config = LoraConfig.from_file(args.lora_config)
     # check lora compatibility
-    if "gemini" in args.plugin and args.lora_rank > 0:
+    if "gemini" in args.plugin and lora_config is not None and lora_config.r > 0:
         raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin")
     if args.plugin == "gemini_auto" and args.accumulation_steps > 1:
         raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin")
@@ -81,20 +84,26 @@ def train(args):
             ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain, local_files_only=True)
             reward_model = RewardModel(args.rm_pretrain)
             critic = Critic(args.rm_pretrain)
+
+        if args.lora_config is not None:
+            actor = convert_to_lora_module(actor, lora_config=lora_config)
+            critic = convert_to_lora_module(critic, lora_config=lora_config)
+            for name, module in actor.named_modules():
+                if "norm" in name or "gate" in name:
+                    module = module.to(torch.float32)
+            for name, module in critic.named_modules():
+                if "norm" in name or "gate" in name:
+                    module = module.to(torch.float32)
+            lora_manager.able_to_merge = False
+
         # Disable dropout
         disable_dropout(actor)
         disable_dropout(critic)
 
-        if args.lora_rank > 0:
-            actor = convert_to_lora_module(actor, args.lora_rank, lora_train_bias=args.lora_train_bias)
-            critic = convert_to_lora_module(critic, args.lora_rank, lora_train_bias=args.lora_train_bias)
-
-    if args.grad_checkpoint and args.lora_rank == 0:
-        actor.gradient_checkpointing_enable()
-        critic.model.gradient_checkpointing_enable()
+    if args.grad_checkpoint:
+        actor.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
+        critic.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-    elif args.lora_rank > 0:
-        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
 
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
@@ -421,11 +430,9 @@ def train(args):
         use_wandb=args.use_wandb,
     )
 
-    if args.lora_rank > 0 and args.merge_lora_weights:
-        from coati.models.lora import LORA_MANAGER
-
+    if lora_config is not None and lora_config.r > 0:
         # NOTE: set model to eval to merge LoRA weights
-        LORA_MANAGER.merge_weights = True
+        lora_manager.able_to_merge = True
         actor.eval()
         critic.eval()
     # save model checkpoint after fitting on only rank0
@@ -484,11 +491,9 @@ def train(args):
     parser.add_argument("--train_batch_size", type=int, default=16)
     parser.add_argument("--experience_batch_size", type=int, default=16)
     parser.add_argument("--ptx_batch_size", type=int, default=4)
-    parser.add_argument("--lora_train_bias", type=str, default="none")
+    parser.add_argument("--lora_config", type=str, default=None, help="low-rank adaptation config file path")
     parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
     parser.add_argument("--accumulation_steps", type=int, default=8)
-    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
-    parser.add_argument("--merge_lora_weights", type=bool, default=True)
     parser.add_argument("--lr", type=float, default=9e-6)
     parser.add_argument("--critic_lr", type=float, default=9e-6)
     parser.add_argument("--kl_coef", type=float, default=0.1)
diff --git a/applications/ColossalChat/examples/training_scripts/train_ppo.sh b/applications/ColossalChat/examples/training_scripts/train_ppo.sh
index 91633978e6ff..277e75e6de56 100755
--- a/applications/ColossalChat/examples/training_scripts/train_ppo.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_ppo.sh
@@ -15,10 +15,9 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
 }
 set_n_least_used_CUDA_VISIBLE_DEVICES 8
 
-PROJECT_NAME="ppo"
+PROJECT_NAME="PPO"
 
 PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
-PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs
 PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
 PRETRAINED_MODEL_PATH="" # local pretrained model path (from RLHF step 1: SFT)
 PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
@@ -54,7 +53,7 @@ declare -a ptx_dataset=(
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
 FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
 
 colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31312 train_ppo.py \
     --pretrain $PRETRAINED_MODEL_PATH \
diff --git a/applications/ColossalChat/examples/training_scripts/train_rm.py b/applications/ColossalChat/examples/training_scripts/train_rm.py
index ce0d02b5d2a4..4c0a782b4766 100755
--- a/applications/ColossalChat/examples/training_scripts/train_rm.py
+++ b/applications/ColossalChat/examples/training_scripts/train_rm.py
@@ -7,7 +7,7 @@
 
 import torch
 from coati.dataset import DataCollatorForPreferenceDataset, StatefulDistributedSampler, load_tokenized_dataset
-from coati.models import LogExpLoss, LogSigLoss, RewardModel, convert_to_lora_module
+from coati.models import LogExpLoss, LogSigLoss, LoraConfig, RewardModel, convert_to_lora_module
 from coati.trainer import RewardModelTrainer
 from coati.utils import load_checkpoint
 from transformers import AutoTokenizer
@@ -16,14 +16,20 @@
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
+from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.shardformer.policies.auto_policy import get_autopolicy
 
+logger = get_dist_logger()
+
 
 def train(args):
+    lora_config = None
+    if args.lora_config is not None:
+        lora_config = LoraConfig.from_file(args.lora_config)
     # check lora compatibility
-    if "gemini" in args.plugin and args.lora_rank > 0:
+    if "gemini" in args.plugin and lora_config is not None and lora_config.r > 0:
         raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin")
     if args.plugin == "gemini_auto" and args.accumulation_steps > 1:
         raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin")
@@ -55,9 +61,11 @@ def train(args):
                 args.pretrain,
             )
 
-        if args.lora_rank > 0:
-            model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
-
+        if lora_config is not None:
+            model = convert_to_lora_module(model, lora_config=lora_config)
+            for name, module in model.named_modules():
+                if "norm" in name or "gate" in name:
+                    module = module.to(torch.float32)
     # ==============================
     # Initialize Booster
     # ==============================
@@ -119,11 +127,9 @@ def train(args):
 
     booster = Booster(plugin=plugin)
 
-    if args.grad_checkpoint and args.lora_rank == 0:
-        model.model.gradient_checkpointing_enable()  # TODO: support gradient checkpoint for the last linear layer
+    if args.grad_checkpoint:
+        model.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-    elif args.lora_rank > 0:
-        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
 
     # configure tokenizer
     tokenizer_dir = args.tokenizer_dir if args.tokenizer_dir is not None else args.pretrain
@@ -173,6 +179,22 @@ def train(args):
         collate_fn=data_collator,
         distributed_sampler_cls=StatefulDistributedSampler,
     )
+
+    eval_dataloader = None
+    if args.eval_dataset:
+        eval_dataset = load_tokenized_dataset(dataset_paths=args.eval_dataset, mode="dev")
+        eval_data_collator = DataCollatorForPreferenceDataset(tokenizer=tokenizer, max_length=args.max_length)
+        eval_dataloader = plugin.prepare_dataloader(
+            dataset=eval_dataset,
+            batch_size=args.batch_size,
+            shuffle=True,
+            drop_last=True,
+            collate_fn=eval_data_collator,
+            distributed_sampler_cls=StatefulDistributedSampler,
+        )
+    else:
+        logger.warning("No evaluation dataset is provided, skip evaluation")
+
     num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
     math.ceil(args.max_epochs * num_update_steps_per_epoch)
 
@@ -253,21 +275,21 @@ def train(args):
 
     trainer.fit(
         train_preference_dataloader=train_dataloader,
-        eval_preference_dataloader=None,
+        eval_preference_dataloader=eval_dataloader,
         log_dir=args.log_dir,
         use_wandb=args.use_wandb,
     )
 
-    if args.lora_rank > 0 and args.merge_lora_weights:
-        from coati.models.lora import LORA_MANAGER
-
+    if lora_config is not None and lora_config.r > 0:
         # NOTE: set model to eval to merge LoRA weights
-        LORA_MANAGER.merge_weights = True
         model.eval()
     # save model checkpoint after fitting on only rank0
-    coordinator.print_on_master("Start saving final model checkpoint")
-    booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
-    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}")
+    if args.save_dir is not None:
+        coordinator.print_on_master("Start saving final model checkpoint")
+        booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
+        coordinator.print_on_master(
+            f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_dir}"
+        )
 
     coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
 
@@ -297,33 +319,28 @@ def train(args):
     parser.add_argument("--pretrain", type=str, default=None)
     parser.add_argument("--tokenizer_dir", type=str, default=None)
     parser.add_argument("--dataset", nargs="+", default=[])
+    parser.add_argument("--eval_dataset", nargs="+", default=[])
     parser.add_argument(
         "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
     )
-    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
-    parser.add_argument("--save_dir", type=str, default="output")
+    parser.add_argument("--config_file", type=str, default=None, help="Config file")
+    parser.add_argument("--save_dir", type=str, default=None)
     parser.add_argument("--max_length", type=int, default=2048, help="Model max length")
     parser.add_argument("--max_epochs", type=int, default=3)
     parser.add_argument("--batch_size", type=int, default=4)
     parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
     parser.add_argument("--loss_fn", type=str, default="log_sig", choices=["log_sig", "log_exp"], help="Loss function")
-    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
-    parser.add_argument(
-        "--lora_train_bias",
-        type=str,
-        default="none",
-        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
-    )
+    parser.add_argument("--lora_config", type=str, default=None, help="low-rank adaptation config file path")
     parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
-    parser.add_argument("--merge_lora_weights", type=bool, default=True)
     parser.add_argument("--lr", type=float, default=5e-6)
     parser.add_argument("--accumulation_steps", type=int, default=8)
-    parser.add_argument("--log_dir", default="logs", type=str)
+    parser.add_argument("--log_dir", default=None, type=str)
     parser.add_argument("--use_wandb", default=False, action="store_true")
     parser.add_argument("--grad_checkpoint", default=False, action="store_true")
     parser.add_argument("--use_flash_attn", default=False, action="store_true")
     args = parser.parse_args()
-    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
-    with open(args.config_file, "w") as f:
-        json.dump(args.__dict__, f, indent=4)
+    if args.config_file is not None:
+        os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
+        with open(args.config_file, "w") as f:
+            json.dump(args.__dict__, f, indent=4)
     train(args)
diff --git a/applications/ColossalChat/examples/training_scripts/train_rm.sh b/applications/ColossalChat/examples/training_scripts/train_rm.sh
index e06d9092fe4c..274417c03fc2 100755
--- a/applications/ColossalChat/examples/training_scripts/train_rm.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_rm.sh
@@ -15,10 +15,10 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
 }
 set_n_least_used_CUDA_VISIBLE_DEVICES 8
 
-PROJECT_NAME="rm"
+PROJECT_NAME="RM"
 PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
-PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs
 PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
+PARENT_LOG_DIR="" # Path to a folder to save training config logs
 PRETRAINED_MODEL_PATH="" # huggingface or local model path
 PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
 
@@ -38,17 +38,18 @@ declare -a dataset=(
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
 FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
+LOG_DIR="${PARENT_LOG_DIR}${FULL_PROJECT_NAME}"
 
 colossalai run --nproc_per_node 8 --hostfile hostfile --master_port 31312 train_rm.py \
     --pretrain $PRETRAINED_MODEL_PATH \
-    --checkpoint_path /home/yeanbang/data/experiments/rm/hhh_aligh/ckptllama2-rm-2024-01-17-14-43-24/epoch-1_step-1317/modeling \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
     --dataset ${dataset[@]} \
     --plugin "zero2" \
     --save_interval 1000 \
     --save_dir $SAVE_DIR \
     --config_file $CONFIG_FILE \
+    --log_dir $LOG_DIR \
     --max_epochs 3 \
     --accumulation_steps 1 \
     --batch_size 8 \
diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.py b/applications/ColossalChat/examples/training_scripts/train_sft.py
index 08e7550df157..6007a8599277 100755
--- a/applications/ColossalChat/examples/training_scripts/train_sft.py
+++ b/applications/ColossalChat/examples/training_scripts/train_sft.py
@@ -7,7 +7,7 @@
 
 import torch
 from coati.dataset import DataCollatorForSupervisedDataset, StatefulDistributedSampler, load_tokenized_dataset
-from coati.models import convert_to_lora_module
+from coati.models import LoraConfig, convert_to_lora_module
 from coati.trainer import SFTTrainer
 from coati.utils import load_checkpoint
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -24,8 +24,11 @@
 
 
 def train(args):
+    lora_config = None
+    if args.lora_config is not None:
+        lora_config = LoraConfig.from_file(args.lora_config)
     # check lora compatibility
-    if "gemini" in args.plugin and args.lora_rank > 0:
+    if "gemini" in args.plugin and lora_config is not None and lora_config.r > 0:
         raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin")
     if args.plugin == "gemini_auto" and args.accumulation_steps > 1:
         raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin")
@@ -53,15 +56,19 @@ def train(args):
                 torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16,
                 trust_remote_code=True,
             )
-    if args.lora_rank > 0:
-        model = convert_to_lora_module(model, args.lora_rank, lora_train_bias=args.lora_train_bias)
+
+    if lora_config is not None:
+        model = convert_to_lora_module(model, lora_config=lora_config)
+        for name, module in model.named_modules():
+            if "norm" in name or "gate" in name:
+                module = module.to(torch.float32)
 
     if args.plugin == "ddp":
         """
         Default torch ddp plugin without any acceleration, for
         debugging purpose acceleration, for debugging purpose
         """
-        plugin = TorchDDPPlugin(find_unused_parameters=True)
+        plugin = TorchDDPPlugin(find_unused_parameters=True if args.grad_checkpoint is False else False)
     elif args.plugin == "gemini":
         plugin = GeminiPlugin(
             precision=args.mixed_precision,
@@ -114,6 +121,15 @@ def train(args):
 
     booster = Booster(plugin=plugin)
 
+    # configure optimizer
+    optim = HybridAdam(
+        model_params=model.parameters(),
+        lr=args.lr,
+        betas=(0.9, 0.95),
+        weight_decay=args.weight_decay,
+        adamw_mode=True,
+    )
+
     # ======================================================
     # Initialize Model, Objective, Optimizer and LR Scheduler
     # ======================================================
@@ -122,12 +138,10 @@ def train(args):
     #     LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
     # )
 
-    if args.grad_checkpoint and args.lora_rank == 0:
-        # lora layers are not supported by gradient checkpointing
-        model.gradient_checkpointing_enable()
+    if args.grad_checkpoint:
+        # Note, for some models, lora may not be compatible with gradient checkpointing
+        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-    elif args.lora_rank > 0:
-        coordinator.print_on_master(msg="Gradient checkpointing will be disabled when LoRA is enabled")
 
     # configure tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
@@ -151,15 +165,6 @@ def train(args):
     coordinator.print_on_master(f"Configuration file will be saved at: {args.config_file}")
     coordinator.print_on_master(f"Model checkpoint will be saved at: {args.save_path}")
 
-    # configure optimizer
-    optim = HybridAdam(
-        model_params=model.parameters(),
-        lr=args.lr,
-        betas=(0.9, 0.95),
-        weight_decay=args.weight_decay,
-        adamw_mode=True,
-    )
-
     # configure dataset
     coordinator.print_on_master(
         f"Max CUDA memory before data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
@@ -175,6 +180,23 @@ def train(args):
         collate_fn=data_collator,
         distributed_sampler_cls=StatefulDistributedSampler,
     )
+
+    eval_dataloader = None
+    if args.eval_dataset:
+        eval_dataset = load_tokenized_dataset(dataset_paths=args.eval_dataset, mode="dev")
+        eval_data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer, max_length=args.max_len)
+
+        eval_dataloader = plugin.prepare_dataloader(
+            dataset=eval_dataset,
+            batch_size=args.batch_size,
+            shuffle=True,
+            drop_last=True,
+            collate_fn=eval_data_collator,
+            distributed_sampler_cls=StatefulDistributedSampler,
+        )
+    else:
+        logger.warning("No evaluation dataset is provided, skip evaluation")
+
     coordinator.print_on_master(
         f"Max CUDA memory after data loader: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB"
     )
@@ -202,6 +224,7 @@ def train(args):
         lr_scheduler=lr_scheduler,
         dataloader=train_dataloader,
     )
+
     torch.set_default_dtype(torch.float)
 
     coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
@@ -257,22 +280,21 @@ def train(args):
 
     trainer.fit(
         train_dataloader=train_dataloader,
-        eval_dataloader=None,
+        eval_dataloader=eval_dataloader,
         log_dir=args.log_dir,
         use_wandb=args.use_wandb,
     )
 
-    if args.lora_rank > 0 and args.merge_lora_weights:
-        from coati.models.lora import LORA_MANAGER
-
+    if lora_config is not None and lora_config.r > 0:
         # NOTE: set model to eval to merge LoRA weights
-        LORA_MANAGER.merge_weights = True
         model.eval()
     # save model checkpoint after fitting on only rank0
-    coordinator.print_on_master("Start saving final model checkpoint")
-
-    # booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
-    coordinator.print_on_master(f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}")
+    if args.save_path is not None:
+        coordinator.print_on_master("Start saving final model checkpoint")
+        booster.save_model(model, os.path.join(args.save_path, "modeling"), shard=True)
+        coordinator.print_on_master(
+            f"Saved final model checkpoint at epoch {args.max_epochs} at folder {args.save_path}"
+        )
 
     coordinator.print_on_master(f"Max CUDA memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f} MB")
 
@@ -302,32 +324,27 @@ def train(args):
     parser.add_argument("--pretrain", type=str, default=None)
     parser.add_argument("--tokenizer_dir", type=str, default=None)
     parser.add_argument("--dataset", nargs="+", default=[])
+    parser.add_argument("--eval_dataset", nargs="+", default=[])
     parser.add_argument(
         "--checkpoint_path", type=str, default=None, help="Checkpoint path if need to resume training form a checkpoint"
     )
-    parser.add_argument("--save_path", type=str, default="output")
+    parser.add_argument("--save_path", type=str, default=None)
     parser.add_argument("--max_epochs", type=int, default=3)
     parser.add_argument("--batch_size", type=int, default=4)
     parser.add_argument("--max_len", type=int, default=512)
     parser.add_argument("--mixed_precision", type=str, default="bf16", choices=["fp16", "bf16"], help="Mixed precision")
-    parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
-    parser.add_argument(
-        "--lora_train_bias",
-        type=str,
-        default="none",
-        help="'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers",
-    )
+    parser.add_argument("--lora_config", type=str, default=None, help="low-rank adaptation config file path")
     parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
-    parser.add_argument("--merge_lora_weights", type=bool, default=True)
     parser.add_argument("--lr", type=float, default=5e-6)
-    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
+    parser.add_argument("--config_file", type=str, default=None, help="Config file")
     parser.add_argument("--accumulation_steps", type=int, default=8)
-    parser.add_argument("--log_dir", default="logs", type=str)
+    parser.add_argument("--log_dir", default=None, type=str)
     parser.add_argument("--use_wandb", default=False, action="store_true")
     parser.add_argument("--grad_checkpoint", default=False, action="store_true")
     parser.add_argument("--use_flash_attn", default=False, action="store_true")
     args = parser.parse_args()
-    os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
-    with open(args.config_file, "w") as f:
-        json.dump(args.__dict__, f, indent=4)
+    if args.config_file is not None:
+        os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
+        with open(args.config_file, "w") as f:
+            json.dump(args.__dict__, f, indent=4)
     train(args)
diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.sh b/applications/ColossalChat/examples/training_scripts/train_sft.sh
index 53c7129013db..e87184c812db 100755
--- a/applications/ColossalChat/examples/training_scripts/train_sft.sh
+++ b/applications/ColossalChat/examples/training_scripts/train_sft.sh
@@ -13,13 +13,11 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
 
-
-# export CUDA_VISIBLE_DEVICES=4,5,6
-set_n_least_used_CUDA_VISIBLE_DEVICES 2
-PROJECT_NAME="sft"
+set_n_least_used_CUDA_VISIBLE_DEVICES 4
+PROJECT_NAME="SFT"
 PARENT_SAVE_DIR="" # Path to a folder to save checkpoints
-PARENT_TENSORBOARD_DIR="" # Path to a folder to save logs
 PARENT_CONFIG_FILE="" # Path to a folder to save training config logs
+PARENT_LOG_DIR="" # Path to a folder to save training config logs
 PRETRAINED_MODEL_PATH="" # huggingface or local model path
 PRETRAINED_TOKENIZER_PATH="" # huggingface or local tokenizer path
 declare -a dataset=(
@@ -38,28 +36,25 @@ declare -a dataset=(
 TIMESTAMP=$(date +%Y-%m-%d-%H-%M-%S)
 FULL_PROJECT_NAME="${PROJECT_NAME}-${TIMESTAMP}"
 SAVE_DIR="${PARENT_SAVE_DIR}${FULL_PROJECT_NAME}"
-CONFIG_FILE="${PARENT_CONFIG_FILE}-${FULL_PROJECT_NAME}.json"
+CONFIG_FILE="${PARENT_CONFIG_FILE}${FULL_PROJECT_NAME}.json"
+LOG_DIR="${PARENT_LOG_DIR}${FULL_PROJECT_NAME}"
 
 echo $(which colossalai)
 echo $(which python)
 # the real batch size for gradient descent is number_of_node_in_hostfile * nproc_per_node * train_batch_size
-colossalai run --nproc_per_node 2 --master_port 31312 --hostfile ./hostfile train_sft.py \
+colossalai run --nproc_per_node 4 --master_port 31312 --hostfile ./hostfile train_sft.py \
     --pretrain $PRETRAINED_MODEL_PATH \
     --tokenizer_dir $PRETRAINED_TOKENIZER_PATH \
-    --save_interval 4000 \
+    --save_interval 2000 \
     --dataset ${dataset[@]} \
-    --save_path $SAVE_DIR \
-    --config_file $CONFIG_FILE \
-    --lora_rank 0 \
-    --plugin 3d \
-    --tp 2 \
-    --pp 1 \
-    --zero_stage 0 \
-    --batch_size 2 \
-    --max_epochs 3 \
+    --plugin zero2 \
+    --batch_size 8 \
+    --max_epochs 1 \
     --accumulation_steps 1 \
     --lr 5e-5 \
-    --max_len 400 \
+    --max_len 4096 \
+    --use_flash_attn \
     --grad_checkpoint \
-    --use_wandb \
-    --use_flash_attn
+    --save_path $SAVE_DIR \
+    --config_file $CONFIG_FILE \
+    --log_dir $LOG_DIR \
diff --git a/applications/ColossalChat/requirements.txt b/applications/ColossalChat/requirements.txt
index ef3a5a0e8420..2188de12f2be 100755
--- a/applications/ColossalChat/requirements.txt
+++ b/applications/ColossalChat/requirements.txt
@@ -1,9 +1,9 @@
-transformers>=4.36.2
+transformers==4.39.3
 tqdm
 datasets==2.14.7
 loralib
-colossalai>=0.3.7
-torch>=1.12.1
+colossalai==0.4.0
+torch>=2.1.0
 langchain
 tokenizers
 fastapi
diff --git a/applications/ColossalChat/tests/generate_dummy_datasets_for_testing.py b/applications/ColossalChat/tests/generate_dummy_datasets_for_testing.py
index 9f85b4beb65d..e50b20b6b212 100644
--- a/applications/ColossalChat/tests/generate_dummy_datasets_for_testing.py
+++ b/applications/ColossalChat/tests/generate_dummy_datasets_for_testing.py
@@ -4,7 +4,7 @@
 
 sft_seed = {
     "messages": [
-        {"from": "human", "content": "Give three tips for staying healthy."},
+        {"from": "user", "content": "Give three tips for staying healthy."},
         {
             "from": "assistant",
             "content": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.",
@@ -13,7 +13,7 @@
 }
 prompt_seed = {
     "messages": [
-        {"from": "human", "content": "Describe the impacts of climate change on communities living in coastal areas."},
+        {"from": "user", "content": "Describe the impacts of climate change on communities living in coastal areas."},
         {
             "from": "assistant",
             "content": "Climate change has caused an increase in sea levels, which has caused coastal erosion and flooding of low-lying areas. This has led to displacement of people from their homes, as well as increased risk of epidemics of waterborne illnesses. Coastal cities have also seen an increase in extreme weather events such as hurricanes and tropical storms, which can cause extensive damage to infrastructure, homes, and businesses. As a result of climate change, some coastal areas are becoming uninhabitable, forcing communities to seek alternative living arrangements.",
@@ -22,21 +22,34 @@
 }
 preference_seed = {
     "context": [
-        {"from": "human", "content": "What kind of noises did dinosaurs make?"},
+        {"from": "user", "content": "What kind of noises did dinosaurs make?"},
         {
             "from": "assistant",
             "content": "Humans and dinosaurs didn't live at the same time, so it's really hard to say. The best place to find out what noises dinosaurs made would be",
         },
-        {"from": "human", "content": "yes they did"},
+        {"from": "user", "content": "yes they did"},
         {
             "from": "assistant",
             "content": "to guess, and that would probably require lots of reading and a certain amount of imagination, so we're not really prepared to do that.",
         },
-        {"from": "human", "content": "you cant read"},
+        {"from": "user", "content": "you cant read"},
     ],
     "chosen": [{"from": "assistant", "content": "You can read?"}],
     "rejected": [{"from": "assistant", "content": "there's a lot of stuff humans don't know"}],
 }
+kto_seed = {
+    "prompt": [
+        {"from": "user", "content": "What are some praise words in english?"},
+        {
+            "from": "assistant",
+            "content": "Here's an incomplete list.\n\nexcellent, fantastic, impressive  ...",
+        },
+        {"from": "user", "content": "What's your favorite one?"},
+    ],
+    "completion": {"from": "assistant", "content": "Impressive."},
+    "label": True,
+}
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -61,12 +74,21 @@
         seed = prompt_seed
     elif args.data_type == "preference":
         seed = preference_seed
+    elif args.data_type == "kto":
+        seed = kto_seed
     else:
         raise ValueError(f"Unknown data type {args.data_type}")
-
-    line = json.dumps(seed, ensure_ascii=False) + "\n"
-    for idx in [1, 2, 3]:
-        with open(os.path.join(args.data_dir, f"{idx}.jsonl"), "w", encoding="utf8") as f:
-            for i in range(1000):
+    if args.data_type != "kto":
+        line = json.dumps(seed, ensure_ascii=False) + "\n"
+        for idx in [1, 2, 3]:
+            with open(os.path.join(args.data_dir, f"{idx}.jsonl"), "w", encoding="utf8") as f:
+                for i in range(1000):
+                    f.write(line)
                 f.write(line)
-            f.write(line)
+    else:
+        for idx in [1, 2, 3]:
+            with open(os.path.join(args.data_dir, f"{idx}.jsonl"), "w", encoding="utf8") as f:
+                for i in range(1000):
+                    seed["label"] = not seed["label"]
+                    line = json.dumps(seed, ensure_ascii=False) + "\n"
+                    f.write(line)
diff --git a/applications/ColossalChat/tests/test_data/dpo/test_dpo_data.jsonl b/applications/ColossalChat/tests/test_data/dpo/test_dpo_data.jsonl
index 2e11a91c643f..0f9a02ea333c 100644
--- a/applications/ColossalChat/tests/test_data/dpo/test_dpo_data.jsonl
+++ b/applications/ColossalChat/tests/test_data/dpo/test_dpo_data.jsonl
@@ -1 +1 @@
-{"context": [{"from": "human", "content": "If I were to give you some coordinates, could you tell me where the location is?"}], "chosen": [{"from": "assistant", "content": "Certainly!  I can tell you if the given coordinates are within any geographic location, such as a country or city.  However, I cannot give accurate directions to any specific location without more information.  Are you looking for a more general geographic location, or do you want more precise directions?"}], "rejected": [{"from": "assistant", "content": "Yes, I can determine the location and give you some details about the area.  Please provide the latitude and longitude coordinates, and I will return a description of the location based on these coordinates.  Please note that precise geographical coordinates are often imprecise, so please provide the best possible coordinates you have available.  Here are some examples:\n-The coordinates 41.543554,-71.09471 locate the Boston Commons in Boston, Massachusetts\n-The coordinates 37.42242, -122.083733 locate the Palace of Fine Arts in San Francisco, California \n-The coordinates 25.774145, -80.190262 locate Canal Street in Miami Beach, Florida\n-The coordinates 35. journalisten nl 48.056944, -106.277778 locate the Wyoming State Penitentiary in Rawlins, Wyoming\n-The coordinates 51.507351,-0.127758 locate Tower Bridge in London, England\n-The coordinates 44.885618, -79.422594 locate the Stratosphere Casino, Hotel and Tower in Las Vegas, Nevada"}]}
+{"context": [{"from": "user", "content": "If I were to giv6rkkurjetes, could you tell me where the location is?"}, {"from": "assistant", "content": "If I were to give you some coordinates, could you tell me where the location is?"}, {"from": "user", "content": "If I were to give you 76i85l5e4henates, could you tell me where the location is?"}], "chosen": [{"from": "assistant", "content": "Certainly!  I can tell you if the given coordinatestw434yy45wn any geographic location, such as a country or city.  However, I cannot give accurate directions to any specific location without more information.  Are you looking for a more general geographic location, or do you want more precise directions?"}], "rejected": [{"from": "assistant", "content": "Yes, I can determine the location and give45wywydfgw46 about the area.  Please provide the latitude and longitude coordinates, and I will return a description of the location based on these coordinates.  Please note that precise geographical coordinates are often imprecise, so please provide the best possible coordinates you have available.  Here are some examples:\n-The coordinates 41.543554,-71.09471 locate the Boston Commons in Boston, Massachusetts\n-The coordinates 37.42242, -122.083733 locate the Palace of Fine Arts in San Francisco, California \n-The coordinates 25.774145, -80.190262 locate Canal Street in Miami Beach, Florida\n-The coordinates 35. journalisten nl 48.056944, -106.277778 locate the Wyoming State Penitentiary in Rawlins, Wyoming\n-The coordinates 51.507351,-0.127758 locate Tower Bridge in London, England\n-The coordinates 44.885618, -79.422594 locate the Stratosphere Casino, Hotel and Tower in Las Vegas, Nevada"}]}
diff --git a/applications/ColossalChat/tests/test_data/kto/test_kto_data.jsonl b/applications/ColossalChat/tests/test_data/kto/test_kto_data.jsonl
new file mode 100644
index 000000000000..4f4fce83da2b
--- /dev/null
+++ b/applications/ColossalChat/tests/test_data/kto/test_kto_data.jsonl
@@ -0,0 +1 @@
+{"prompt": [{"from": "user", "content": "What are some praise words in english?"}, {"from": "assistant", "content": "Here's an incomplete list.\n\nexcellent, fantastic, impressive  ..."},{"from": "user", "content": "What's your favorite one?"}],"completion": {"from": "assistant", "content": "impressive."},"label": true}
diff --git a/applications/ColossalChat/tests/test_data/sft/test_sft_data.jsonl b/applications/ColossalChat/tests/test_data/sft/test_sft_data.jsonl
index 21c4d9dc76ec..759bba7a053c 100644
--- a/applications/ColossalChat/tests/test_data/sft/test_sft_data.jsonl
+++ b/applications/ColossalChat/tests/test_data/sft/test_sft_data.jsonl
@@ -1 +1 @@
-{"messages": [{"from": "human", "content": "what are some pranks with a pen I can do?"}, {"from": "assistant", "content": "Are you looking for practical joke ideas?"}, {"from": "human", "content": "Yes, exactly!"}, {"from": "assistant", "content": "You could try attaching a small buzzer to a pen. When someone tries to use it, they'll be surprised by the sound!"}, {"from": "human", "content": "That's a funny one! Any other ideas?"}, {"from": "assistant", "content": "Another idea is the classic 'ink explosion' prank. Replace the ink of a pen with disappearing ink, and watch the confusion when the ink vanishes from the paper."}]}
+{"messages": [{"from": "user", "content": "what are some pranks with a pen I can do?"}, {"from": "assistant", "content": "Are you looking for practical joke ideas?"}, {"from": "user", "content": "Yes, exactly!"}, {"from": "assistant", "content": "You could try attaching a small buzzer to a pen. When someone tries to use it, they'll be surprised by the sound!"}, {"from": "user", "content": "That's a funny one! Any other ideas?"}, {"from": "assistant", "content": "Another idea is the classic 'ink explosion' prank. Replace the ink of a pen with disappearing ink, and watch the confusion when the ink vanishes from the paper."}]}
diff --git a/applications/ColossalChat/tests/test_data_preparation.sh b/applications/ColossalChat/tests/test_data_preparation.sh
index a7689cdc6688..427c3952b0d4 100755
--- a/applications/ColossalChat/tests/test_data_preparation.sh
+++ b/applications/ColossalChat/tests/test_data_preparation.sh
@@ -71,6 +71,8 @@ get_data_input_dirs() {
         echo "$PROMPT_DATASET"
     elif [[ $data_type == "preference" ]]; then
         echo "$PREFERENCE_DATASET"
+    elif [[ $data_type == "kto" ]]; then
+        echo "$KTO_DATASET"
     else
         echo "Unknown data type $data_type"
         exit 1
@@ -121,6 +123,10 @@ python $TEST_DIR/generate_dummy_datasets_for_testing.py \
     --data_dir $(get_data_input_dirs prompt) \
     --data_type "prompt"
 
+python $TEST_DIR/generate_dummy_datasets_for_testing.py \
+    --data_dir $(get_data_input_dirs kto) \
+    --data_type "kto"
+
 echo "[Test]: testing prepare_preference_dataset.py ..."
 
 # FIXME: This is a hack to skip tests that are not working
@@ -258,3 +264,50 @@ for model in ${MODELS[@]}; do
         exit 1
     fi
 done
+
+
+echo "[Test]: testing prepare_kto_dataset.py ..."
+
+# FIXME: This is a hack to skip tests that are not working
+SKIPPED_TESTS=(
+)
+
+# test prepare_kto_dataset
+for model in ${MODELS[@]}; do
+    data_type="kto"
+    if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$data_type " ]]; then
+        echo "[Test]: Skipped $model-$data_type"
+        continue
+    fi
+    cache_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/cache
+    jsonl_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/jsonl
+    arrow_dir=$DATA_SAVE_PATH/tokenized_${model}_${data_type}/arrow
+    data_input_dirs=$(get_data_input_dirs $data_type)
+    tokenizer_dir=$(get_tokenizer_dirs $model)
+    conversation_template=$(get_conversation_template_config $model)
+    for i in $(seq $NUM_RETRY); do
+        rm -rf $cache_dir
+        rm -rf $jsonl_dir
+        rm -rf $arrow_dir
+        echo "[Test]: $model-$data_type, attempt $i"
+        python $EXAMPLES_DIR/data_preparation_scripts/prepare_dataset.py \
+            --type kto \
+            --data_input_dirs $data_input_dirs \
+            --conversation_template_config $conversation_template \
+            --tokenizer_dir $tokenizer_dir \
+            --data_cache_dir $cache_dir \
+            --data_jsonl_output_dir $jsonl_dir \
+            --data_arrow_output_dir $arrow_dir \
+            --max_length 400 \
+            --num_samples_per_datafile 100 \
+            --num_spliced_dataset_bins 1
+        passed=$?
+        if [ $passed -eq 0 ]; then
+            break
+        fi
+    done
+    if [ $passed -ne 0 ]; then
+        echo "[Test]: Failed $model-$data_type"
+        exit 1
+    fi
+done
diff --git a/applications/ColossalChat/tests/test_lora.py b/applications/ColossalChat/tests/test_lora.py
index 4ea9e1a15c59..7787592105b6 100755
--- a/applications/ColossalChat/tests/test_lora.py
+++ b/applications/ColossalChat/tests/test_lora.py
@@ -2,6 +2,7 @@
 import torch.nn as nn
 import torch.optim as optim
 from coati.models import convert_to_lora_module
+from coati.models.lora import LoraConfig, LoraEmbedding, LoraLinear
 from torch.utils.data import DataLoader, TensorDataset
 
 
@@ -38,7 +39,7 @@ def test_overfit():
     # Build and convert model
     model = SimpleNN(input_size, hidden_size, num_classes)
     weight_to_compare = model.fc1.weight.detach().clone()
-    model = convert_to_lora_module(model, lora_rank=30)
+    model = convert_to_lora_module(model, lora_config=LoraConfig(r=32))
 
     # Loss and optimizer
     criterion = nn.CrossEntropyLoss()
@@ -50,7 +51,6 @@ def test_overfit():
             # Forward pass
             outputs = model(inputs)
             loss = criterion(outputs, labels)
-            print(loss)
             # Backward and optimize
             optimizer.zero_grad()
             loss.backward()
@@ -65,5 +65,50 @@ def test_overfit():
     assert (weight_to_compare - model.fc1.weight).sum() < 0.01
 
 
+def test_lora_linear_accuracy():
+
+    weight = torch.randn(10, 5)
+    linear = nn.Linear(5, 10)
+    linear.weight.data = weight
+    x = torch.randn(10, 5)
+    out_linear = linear(x)
+
+    # lora linear Pissa
+    linear.weight.data = weight
+    lora_linear = LoraLinear(linear.weight, linear.bias, r=2, lora_initialization_method="PiSSA")
+    out_lora = lora_linear(x)
+    assert torch.allclose(out_linear, out_lora, atol=1e-5, rtol=1e-05)
+
+    # lora linear
+    linear.weight.data = weight
+    lora_linear = LoraLinear(linear.weight, linear.bias, r=2)
+    out_lora = lora_linear(x)
+    assert torch.allclose(out_linear, out_lora, atol=1e-5, rtol=1e-05)
+
+
+def test_lora_embedding_accuracy():
+    weight = torch.randn(10, 5)
+    embedding = nn.Embedding(10, 5)
+    embedding.weight.data = weight
+    x = torch.randint(0, 10, (10,))
+    out_embedding = embedding(x)
+
+    # lora embedding Pissa
+    embedding.weight.data = weight
+    lora_embedding = LoraEmbedding(
+        embedding.weight, r=2, lora_initialization_method="PiSSA", num_embeddings=10, embedding_dim=5
+    )
+    out_lora = lora_embedding(x)
+    assert torch.allclose(out_embedding, out_lora, atol=1e-5, rtol=1e-05)
+
+    # lora embedding
+    embedding.weight.data = weight
+    lora_embedding = LoraEmbedding(embedding.weight, r=2, num_embeddings=10, embedding_dim=5)
+    out_lora = lora_embedding(x)
+    assert torch.allclose(out_embedding, out_lora, atol=1e-5, rtol=1e-05)
+
+
 if __name__ == "__main__":
     test_overfit()
+    test_lora_linear_accuracy()
+    test_lora_embedding_accuracy()
diff --git a/applications/ColossalChat/tests/test_templating.sh b/applications/ColossalChat/tests/test_templating.sh
index d033c07f5fa4..6ee10e8bed87 100755
--- a/applications/ColossalChat/tests/test_templating.sh
+++ b/applications/ColossalChat/tests/test_templating.sh
@@ -94,7 +94,7 @@ done
 
 # Test DPO/PPO data Preparation
 for model in ${MODELS[@]}; do
-    echo "Testing DPO/PPO data templating for $model"
+    echo "Testing DPO/RM data templating for $model"
     SAVE_DIR=$DATA_SAVE_PATH/dpo/$model
     rm -rf $SAVE_DIR/cache
     rm -rf $SAVE_DIR/jsonl
@@ -109,14 +109,44 @@ for model in ${MODELS[@]}; do
         --data_arrow_output_dir $SAVE_DIR/arrow
     passed=$?
     if [ $passed -ne 0 ]; then
-        echo "[Test]: Failed in the DPO data templating for $model"
+        echo "[Test]: Failed in the DPO/RM data templating for $model"
         exit 1
     fi
     python $BASE_DIR/tests/verify_chat_data.py --data_source $TEST_DATA_DIR/dpo/test_dpo_data.jsonl \
         --to_verify_file $SAVE_DIR/jsonl/part-00005.jsonl --data_type dpo
     passed=$?
     if [ $passed -ne 0 ]; then
-        echo "[Test]: Failed in the DPO data templating test for $model"
+        echo "[Test]: Failed in the DPO/RM data templating test for $model"
+        exit 1
+    fi
+done
+
+
+# Test KTO data Preparation
+for model in ${MODELS[@]}; do
+    echo "Testing KTO data templating for $model"
+    SAVE_DIR=$DATA_SAVE_PATH/kto/$model
+    rm -rf $SAVE_DIR/cache
+    rm -rf $SAVE_DIR/jsonl
+    rm -rf $SAVE_DIR/arrow
+    pretrain=$(get_pretrain $model)
+    conversation_template_config=$(get_conversation_template_config $model)
+    python $EXAMPLES_DIR/data_preparation_scripts/prepare_dataset.py --type kto --data_input_dirs $TEST_DATA_DIR/kto \
+        --tokenizer_dir  $pretrain \
+        --conversation_template_config $conversation_template_config \
+        --data_cache_dir $SAVE_DIR/cache \
+        --data_jsonl_output_dir $SAVE_DIR/jsonl \
+        --data_arrow_output_dir $SAVE_DIR/arrow
+    passed=$?
+    if [ $passed -ne 0 ]; then
+        echo "[Test]: Failed in the KTO data templating for $model"
+        exit 1
+    fi
+    python $BASE_DIR/tests/verify_chat_data.py --data_source $TEST_DATA_DIR/kto/test_kto_data.jsonl \
+        --to_verify_file $SAVE_DIR/jsonl/part-00005.jsonl --data_type kto
+    passed=$?
+    if [ $passed -ne 0 ]; then
+        echo "[Test]: Failed in the KTO data templating test for $model"
         exit 1
     fi
 done
diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index d1a685174177..c26b25c837e6 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -30,9 +30,10 @@ MODEL_SAVE_PATH=$TEMP_DIR/rlhf_models
 MODELS_DIR=$TEMP_DIR/models_config
 # Skip those tests due to CI tests timeout
 MODELS=('llama')
-ADVANCED_PLUGINS=('sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2' 'zero2_cpu')  # pp is still buggy
-PLUGINS=('3d' 'gemini' 'gemini_auto' 'zero2' 'zero2_cpu')
+ADVANCED_PLUGINS=('zero2' 'sp_split_gather' 'sp_ring' 'sp_all_to_all' 'tp_zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu')  # pp is still buggy
+PLUGINS=('zero2' '3d' 'gemini' 'gemini_auto' 'zero2_cpu')
 LORA_RANK=('0')  # skip to reduce CI execution time, can pass all locally
+LORA_CONFIG_ENABLE="--lora_config $BASE_DIR/examples/training_scripts/lora_config.json"
 
 export OMP_NUM_THREADS=8
 
@@ -112,6 +113,11 @@ for lora_rank in ${LORA_RANK[@]}; do
             sp='1'
             sp_mode='split_gather'
             enable_sequence_parallelism=''
+            if [[ $plugin == "zero2" ]]; then
+                lora_config=$LORA_CONFIG_ENABLE
+            else
+                lora_config=""
+            fi
             if [[ $plugin == "3d" ]]; then
                 tp='4'
                 bs='8'
@@ -173,9 +179,10 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --dataset ${dataset[@]} \
+                    --eval_dataset ${dataset[@]} \
                     --save_path $MODEL_SAVE_PATH \
                     --config_file $MODELS_DIR/config.jsonl \
-                    --lora_rank $lora_rank \
+                    $lora_config \
                     --plugin $plugin \
                     --batch_size $bs \
                     --max_epochs 1 \
@@ -192,8 +199,8 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --use_flash_attn
                 passed=$?
                 if [ $passed -eq 0 ]; then
-                    rm -rf $MODEL_SAVE_PATH/*
-                    rm -rf $MODELS_DIR/*
+                    rm -rf ${MODEL_SAVE_PATH:?}/*
+                    rm -rf ${MODELS_DIR:?}/*
                     break
                 fi
             done
@@ -229,6 +236,11 @@ for lora_rank in ${LORA_RANK[@]}; do
             grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
             tp='1'
             bs='2'
+            if [[ $plugin == "zero2" ]]; then
+                lora_config=$LORA_CONFIG_ENABLE
+            else
+                lora_config=""
+            fi
             if [[ $plugin == "3d" ]]; then
                 tp='4'
                 bs='8'
@@ -248,9 +260,10 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --dataset ${dataset[@]} \
+                    --eval_dataset ${dataset[@]} \
                     --save_dir $MODEL_SAVE_PATH \
                     --config_file $MODELS_DIR/config.jsonl \
-                    --lora_rank $lora_rank \
+                    $lora_config \
                     --plugin $plugin \
                     --batch_size $bs \
                     --max_epochs 1 \
@@ -262,8 +275,8 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --use_flash_attn
                 passed=$?
                 if [ $passed -eq 0 ]; then
-                    rm -rf $MODEL_SAVE_PATH/*
-                    rm -rf $MODELS_DIR/*
+                    rm -rf ${MODEL_SAVE_PATH:?}/*
+                    rm -rf ${MODELS_DIR:?}/*
                     break
                 fi
             done
@@ -306,6 +319,11 @@ for lora_rank in ${LORA_RANK[@]}; do
             bs='4'
             ebs='8'
             conversation_template=$(get_conversation_template_config $model)
+            if [[ $plugin == "zero2" ]]; then
+                lora_config=$LORA_CONFIG_ENABLE
+            else
+                lora_config=""
+            fi
             if [[ $plugin == "3d" ]]; then
                 tp='4'
                 bs='16'
@@ -342,7 +360,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --ptx_batch_size 1 \
                     --ptx_coef 0.2 \
                     --save_path $MODEL_SAVE_PATH \
-                    --lora_rank $lora_rank \
+                    $lora_config \
                     --plugin $plugin \
                     --num_episodes 5 \
                     --num_collect_steps 1 \
@@ -361,8 +379,8 @@ for lora_rank in ${LORA_RANK[@]}; do
                     # --use_flash_attn
                 passed=$?
                 if [ $passed -eq 0 ]; then
-                    rm -rf $MODEL_SAVE_PATH/*
-                    rm -rf $MODELS_DIR/*
+                    rm -rf ${MODEL_SAVE_PATH:?}/*
+                    rm -rf ${MODELS_DIR:?}/*
                     break
                 fi
             done
@@ -402,6 +420,11 @@ for lora_rank in ${LORA_RANK[@]}; do
                 tp='4'
                 bs='8'
             fi
+            if [[ $plugin == "zero2" ]]; then
+                lora_config=$LORA_CONFIG_ENABLE
+            else
+                lora_config=""
+            fi
             grad_accu='2'
             # gemini_auto and gemini doesn't support gradient accumulation
             if [[ $plugin == "gemini_auto" ]]; then
@@ -423,22 +446,191 @@ for lora_rank in ${LORA_RANK[@]}; do
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --dataset ${dataset[@]} \
+                    --eval_dataset ${dataset[@]} \
+                    --save_dir $MODEL_SAVE_PATH \
+                    --config_file $MODELS_DIR/config.jsonl \
+                    $lora_config \
+                    --plugin $plugin \
+                    --batch_size $bs \
+                    --max_epochs 1 \
+                    --accumulation_steps $grad_accu \
+                    --tp $tp \
+                    --lr 2e-5 \
+                    $grad_ckpt \
+                    --max_len 400 \
+                    --use_flash_attn
+                passed=$?
+                if [ $passed -eq 0 ]; then
+                    rm -rf ${MODEL_SAVE_PATH:?}/*
+                    rm -rf ${MODELS_DIR:?}/*
+                    break
+                fi
+            done
+            if [ $passed -ne 0 ]; then
+                echo "[Test]: Failed $model-$plugin-$lora_rank"
+                exit 1
+            fi
+        done
+    done
+done
+
+
+
+echo "[Test]: testing ORPO ..."
+
+SKIPPED_TESTS=(
+    llama-3d-20 # 3d plugin doesn't support lora
+    llama-gemini_auto-20  # gemini_auto plugin doesn't support lora
+    llama-gemini-20 # gemini doesn't support lora
+)
+GRAD_CKPTS=('--grad_checkpoint')
+for lora_rank in ${LORA_RANK[@]}; do
+    for model in ${MODELS[@]}; do
+        for plugin in ${PLUGINS[@]}; do
+            if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin-$lora_rank " ]]; then
+                echo "[Test]: Skipped $model-$plugin-$lora_rank"
+                continue
+            elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin " ]]; then
+                echo "[Test]: Skipped $model-$plugin"
+                continue
+            fi
+            pretrain=$(get_pretrain $model)
+            tokenizer_dir=$(get_tokenizer_dirs $model)
+            grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
+            tp='1'
+            bs='2'
+            if [[ $plugin == "3d" ]]; then
+                tp='4'
+                bs='8'
+            fi
+            if [[ $plugin == "zero2" ]]; then
+                lora_config=$LORA_CONFIG_ENABLE
+            else
+                lora_config=""
+            fi
+            grad_accu='2'
+            # gemini_auto and gemini doesn't support gradient accumulation
+            if [[ $plugin == "gemini_auto" ]]; then
+                grad_accu='1'
+            fi
+            # gemini_auto doesn't support generation
+            # (need to calculate ref_model logits through forwarding in inference mode)
+            if [[ $plugin == "gemini_auto" ]]; then
+                echo "[Test]: Skipped $model-$plugin"
+                continue
+            fi
+            for i in $(seq $NUM_RETRY); do
+                echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
+                declare -a dataset=()
+                for split in $(seq -f "%05g" 0 0); do
+                    dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_preference/arrow/part-$split")
+                done
+                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_orpo.py \
+                    --pretrain $pretrain \
+                    --tokenizer_dir $tokenizer_dir \
+                    --dataset ${dataset[@]} \
+                    --eval_dataset ${dataset[@]} \
+                    --save_dir $MODEL_SAVE_PATH \
+                    --config_file $MODELS_DIR/config.jsonl \
+                    $lora_config \
+                    --plugin $plugin \
+                    --batch_size $bs \
+                    --max_epochs 1 \
+                    --accumulation_steps $grad_accu \
+                    --tp $tp \
+                    --lr 2e-5 \
+                    $grad_ckpt \
+                    --max_len 400 \
+                    --use_flash_attn
+                passed=$?
+                if [ $passed -eq 0 ]; then
+                    rm -rf ${MODEL_SAVE_PATH:?}/*
+                    rm -rf ${MODELS_DIR:?}/*
+                    break
+                fi
+            done
+            if [ $passed -ne 0 ]; then
+                echo "[Test]: Failed $model-$plugin-$lora_rank"
+                exit 1
+            fi
+        done
+    done
+done
+
+
+
+echo "[Test]: testing KTO ..."
+
+SKIPPED_TESTS=(
+    llama-3d-20 # 3d plugin doesn't support lora
+    llama-gemini_auto-20  # gemini_auto plugin doesn't support lora
+    llama-gemini-20 # gemini doesn't support lora
+)
+GRAD_CKPTS=('--grad_checkpoint')
+for lora_rank in ${LORA_RANK[@]}; do
+    for model in ${MODELS[@]}; do
+        for plugin in ${PLUGINS[@]}; do
+            if [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin-$lora_rank " ]]; then
+                echo "[Test]: Skipped $model-$plugin-$lora_rank"
+                continue
+            elif [[ " ${SKIPPED_TESTS[*]} " =~ " $model-$plugin " ]]; then
+                echo "[Test]: Skipped $model-$plugin"
+                continue
+            fi
+            pretrain=$(get_pretrain $model)
+            tokenizer_dir=$(get_tokenizer_dirs $model)
+            grad_ckpt=$(random_choice "${GRAD_CKPTS[@]}")
+            tp='1'
+            bs='2'
+            if [[ $plugin == "3d" ]]; then
+                tp='4'
+                bs='8'
+            fi
+            if [[ $plugin == "zero2" ]]; then
+                lora_config=$LORA_CONFIG_ENABLE
+            else
+                lora_config=""
+            fi
+            grad_accu='2'
+            # gemini_auto and gemini doesn't support gradient accumulation
+            if [[ $plugin == "gemini_auto" ]]; then
+                grad_accu='1'
+            fi
+            # gemini_auto doesn't support generation
+            # (need to calculate ref_model logits through forwarding in inference mode)
+            if [[ $plugin == "gemini_auto" ]]; then
+                echo "[Test]: Skipped $model-$plugin"
+                continue
+            fi
+            for i in $(seq $NUM_RETRY); do
+                echo "[Test]: $model-$plugin-$lora_rank, attempt $i"
+                declare -a dataset=()
+                for split in $(seq -f "%05g" 0 0); do
+                    dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_kto/arrow/part-$split")
+                done
+                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_kto.py \
+                    --pretrain $pretrain \
+                    --tokenizer_dir $tokenizer_dir \
+                    --dataset ${dataset[@]} \
+                    --eval_dataset ${dataset[@]} \
                     --save_dir $MODEL_SAVE_PATH \
                     --config_file $MODELS_DIR/config.jsonl \
-                    --lora_rank $lora_rank \
+                    $lora_config \
                     --plugin $plugin \
                     --batch_size $bs \
                     --max_epochs 1 \
                     --accumulation_steps $grad_accu \
                     --tp $tp \
                     --lr 2e-5 \
+                    --auto_weight \
+                    --desirable_weight 1.2 \
                     $grad_ckpt \
                     --max_len 400 \
                     --use_flash_attn
                 passed=$?
                 if [ $passed -eq 0 ]; then
-                    rm -rf $MODEL_SAVE_PATH/*
-                    rm -rf $MODELS_DIR/*
+                    rm -rf ${MODEL_SAVE_PATH:?}/*
+                    rm -rf ${MODELS_DIR:?}/*
                     break
                 fi
             done
diff --git a/applications/ColossalChat/tests/verify_chat_data.py b/applications/ColossalChat/tests/verify_chat_data.py
index 98ae0c1b2d28..eb8f9ce46075 100644
--- a/applications/ColossalChat/tests/verify_chat_data.py
+++ b/applications/ColossalChat/tests/verify_chat_data.py
@@ -62,3 +62,11 @@
         assert any(
             [rejected_lable in s for s in to_verify_lable_rejected]
         ), f"Rejected label {rejected_lable} not in target rejected label {to_verify_lable_chosen}"
+    elif args.data_type == "kto":
+        sample = data[0]
+        to_verify_data = to_verify_data[0]
+        for line in sample["prompt"]:
+            assert line["content"] in to_verify_data["input_id_decode"]
+        assert sample["completion"]["content"] in to_verify_data["input_id_decode"]
+        assert sample["completion"]["content"] in to_verify_data["completion_decode"]
+        assert sample["label"] == to_verify_data["label"]
diff --git a/applications/ColossalEval/colossal_eval/dataset/agieval.py b/applications/ColossalEval/colossal_eval/dataset/agieval.py
index d5f2302494e8..c1cfe37d7599 100644
--- a/applications/ColossalEval/colossal_eval/dataset/agieval.py
+++ b/applications/ColossalEval/colossal_eval/dataset/agieval.py
@@ -197,9 +197,7 @@ class AGIEvalDataset(BaseDataset):
     """
 
     @staticmethod
-    def load(
-        path: str, logger: DistributedLogger, few_shot: bool, forward_only: bool, load_train: bool, load_reference: bool
-    ) -> List[Dict]:
+    def load(path: str, logger: DistributedLogger, few_shot: bool, *args, **kwargs) -> List[Dict]:
         dataset = {"test": {}}
 
         files = glob.glob(os.path.join(path, "*.jsonl"))
diff --git a/applications/ColossalEval/colossal_eval/dataset/base.py b/applications/ColossalEval/colossal_eval/dataset/base.py
index 531313d7e3c0..a29f56fd1998 100644
--- a/applications/ColossalEval/colossal_eval/dataset/base.py
+++ b/applications/ColossalEval/colossal_eval/dataset/base.py
@@ -1,6 +1,9 @@
 from abc import abstractstaticmethod
 
 from colossal_eval.utils import jdump
+from torch.utils.data import Dataset
+
+from colossalai.logging import DistributedLogger
 
 
 class BaseDataset:
@@ -12,13 +15,24 @@ class BaseDataset:
         logger: Logger for the dataset.
     """
 
-    def __init__(self, path, logger, few_shot, forward_only=False, load_train=False, load_reference=False):
-        self.dataset = self.load(path, logger, few_shot, forward_only, load_train, load_reference)
+    def __init__(self, path, logger, *args, **kwargs):
+        self.dataset = self.load(path, logger, *args, **kwargs)
 
     def save(self, save_path):
         """Save the converted dataset"""
         jdump(self.dataset, save_path)
 
     @abstractstaticmethod
-    def load(path, logger):
+    def load(path, logger: DistributedLogger, *args, **kwargs):
         """Load the original dataset and convert it into the inference dataset"""
+
+
+class DistributedDataset(Dataset):
+    def __init__(self, data):
+        self.data = data
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        return self.data[idx]
diff --git a/applications/ColossalEval/colossal_eval/dataset/ceval.py b/applications/ColossalEval/colossal_eval/dataset/ceval.py
index 915f4d9b0850..1023d1e23c1f 100644
--- a/applications/ColossalEval/colossal_eval/dataset/ceval.py
+++ b/applications/ColossalEval/colossal_eval/dataset/ceval.py
@@ -90,9 +90,7 @@ class CEvalDataset(BaseDataset):
     """
 
     @staticmethod
-    def load(
-        path: str, logger: DistributedLogger, few_shot: bool, forward_only: bool, load_train: bool, load_reference: bool
-    ) -> List[Dict]:
+    def load(path: str, logger: DistributedLogger, few_shot: bool, *args, **kwargs) -> List[Dict]:
         dataset = {"dev": {}, "test": {}}
         for split in ["dev", "test"]:
             files = os.listdir(os.path.join(path, split))
diff --git a/applications/ColossalEval/colossal_eval/dataset/cmmlu.py b/applications/ColossalEval/colossal_eval/dataset/cmmlu.py
index 477280663218..05752c2486fa 100644
--- a/applications/ColossalEval/colossal_eval/dataset/cmmlu.py
+++ b/applications/ColossalEval/colossal_eval/dataset/cmmlu.py
@@ -101,9 +101,7 @@ class CMMLUDataset(BaseDataset):
     """
 
     @staticmethod
-    def load(
-        path: str, logger: DistributedLogger, few_shot: bool, forward_only: bool, load_train: bool, load_reference: bool
-    ) -> List[Dict]:
+    def load(path: str, logger: DistributedLogger, few_shot: bool, *args, **kwargs) -> List[Dict]:
         dataset = {"dev": {}, "test": {}}
         for split in ["dev", "test"]:
             files = os.listdir(os.path.join(path, split))
diff --git a/applications/ColossalEval/colossal_eval/dataset/colossalai.py b/applications/ColossalEval/colossal_eval/dataset/colossalai.py
index 54ea478ae5d6..0337454fa788 100644
--- a/applications/ColossalEval/colossal_eval/dataset/colossalai.py
+++ b/applications/ColossalEval/colossal_eval/dataset/colossalai.py
@@ -37,7 +37,7 @@ class ColossalDataset(BaseDataset):
     """
 
     @staticmethod
-    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+    def load(path: str, logger: DistributedLogger, *args, **kwargs) -> List[Dict]:
         dataset = {"test": {}}
         data = jload(path)
         data_per_category = get_data_per_category(data)
diff --git a/applications/ColossalEval/colossal_eval/dataset/cvalues.py b/applications/ColossalEval/colossal_eval/dataset/cvalues.py
index 30e802a028c8..4023a4c76322 100644
--- a/applications/ColossalEval/colossal_eval/dataset/cvalues.py
+++ b/applications/ColossalEval/colossal_eval/dataset/cvalues.py
@@ -28,7 +28,7 @@ class CValuesDataset(BaseDataset):
     """
 
     @staticmethod
-    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+    def load(path: str, logger: DistributedLogger, *args, **kwargs) -> List[Dict]:
         dataset = {"test": {}}
         file_path = os.path.join(path, "cvalues_responsibility_mc.jsonl")
         data_list = []
diff --git a/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py b/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py
index cda6276bfe05..44ccea9cfa2c 100644
--- a/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py
+++ b/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py
@@ -69,9 +69,7 @@ class GaoKaoBenchDataset(BaseDataset):
     """
 
     @staticmethod
-    def load(
-        path: str, logger: DistributedLogger, few_shot: bool, forward_only: bool, load_train: bool, load_reference: bool
-    ) -> List[Dict]:
+    def load(path: str, logger: DistributedLogger, *args, **kwargs) -> List[Dict]:
         dataset = {"test": {}}
         for category in ["Fill-in-the-blank_Questions", "Multiple-choice_Questions", "Open-ended_Questions"]:
             files = os.listdir(os.path.join(path, "data", category))
diff --git a/applications/ColossalEval/colossal_eval/dataset/longbench.py b/applications/ColossalEval/colossal_eval/dataset/longbench.py
index 9ea5e3c7d77f..eb61efaa0d7c 100644
--- a/applications/ColossalEval/colossal_eval/dataset/longbench.py
+++ b/applications/ColossalEval/colossal_eval/dataset/longbench.py
@@ -77,7 +77,7 @@ class LongBenchDataset(BaseDataset):
     """
 
     @staticmethod
-    def load(path: str, logger: DistributedLogger) -> List[Dict]:
+    def load(path: str, logger: DistributedLogger, *args, **kwargs) -> List[Dict]:
         dataset = {"test": {}}
 
         files = os.listdir(path)
diff --git a/applications/ColossalEval/colossal_eval/dataset/mmlu.py b/applications/ColossalEval/colossal_eval/dataset/mmlu.py
index dcda68e8f5ac..e9465c91b3ce 100644
--- a/applications/ColossalEval/colossal_eval/dataset/mmlu.py
+++ b/applications/ColossalEval/colossal_eval/dataset/mmlu.py
@@ -31,9 +31,7 @@ class MMLUDataset(BaseDataset):
     """
 
     @staticmethod
-    def load(
-        path: str, logger: DistributedLogger, few_shot: bool, forward_only: bool, load_train: bool, load_reference: bool
-    ) -> List[Dict]:
+    def load(path: str, logger: DistributedLogger, few_shot: bool, *args, **kwargs) -> List[Dict]:
         dataset = {"dev": {}, "test": {}}
         for split in ["dev", "test"]:
             files = os.listdir(os.path.join(path, split))
diff --git a/applications/ColossalEval/colossal_eval/dataset/mtbench.py b/applications/ColossalEval/colossal_eval/dataset/mtbench.py
index 03141556788f..ef474ec4ca23 100644
--- a/applications/ColossalEval/colossal_eval/dataset/mtbench.py
+++ b/applications/ColossalEval/colossal_eval/dataset/mtbench.py
@@ -27,12 +27,12 @@ class MTBenchDataset(BaseDataset):
     This dataset class will convert the original dataset into the inference dataset.
     """
 
-    def __init__(self, path, logger, few_shot):
+    def __init__(self, path, logger: DistributedLogger, *args, **kwargs):
         self.multiturn = True
-        self.dataset = self.load(path, logger, few_shot)
+        self.dataset = self.load(path, logger, *args, **kwargs)
 
     @staticmethod
-    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+    def load(path: str, logger: DistributedLogger, *args, **kwargs) -> List[Dict]:
         dataset = {"test": defaultdict(dict)}
 
         file_path = os.path.join(path, "question.jsonl")
diff --git a/applications/ColossalEval/colossal_eval/dataset/safetybench_en.py b/applications/ColossalEval/colossal_eval/dataset/safetybench_en.py
index e77a3da34060..8056c3dfd8bf 100644
--- a/applications/ColossalEval/colossal_eval/dataset/safetybench_en.py
+++ b/applications/ColossalEval/colossal_eval/dataset/safetybench_en.py
@@ -130,7 +130,7 @@ class SafetyBenchENDataset(BaseDataset):
     """
 
     @staticmethod
-    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+    def load(path: str, logger: DistributedLogger, few_shot: bool, *args, **kwargs) -> List[Dict]:
         dataset = {"dev": {}, "test": {}}
         data_files = [os.path.join(path, file_name) for file_name in FILES]
         for file_path in data_files:
diff --git a/applications/ColossalEval/colossal_eval/dataset/safetybench_zh.py b/applications/ColossalEval/colossal_eval/dataset/safetybench_zh.py
index 3eca808bbc5b..f5f17e64c991 100644
--- a/applications/ColossalEval/colossal_eval/dataset/safetybench_zh.py
+++ b/applications/ColossalEval/colossal_eval/dataset/safetybench_zh.py
@@ -130,7 +130,7 @@ class SafetyBenchZHDataset(BaseDataset):
     """
 
     @staticmethod
-    def load(path: str, logger: DistributedLogger, few_shot: bool) -> List[Dict]:
+    def load(path: str, logger: DistributedLogger, few_shot: bool, *args, **kwargs) -> List[Dict]:
         dataset = {"dev": {}, "test": {}}
         data_files = [os.path.join(path, file_name) for file_name in FILES]
         for file_path in data_files:
diff --git a/applications/ColossalEval/colossal_eval/models/huggingface.py b/applications/ColossalEval/colossal_eval/models/huggingface.py
index 23c399ccedbd..e91743525f0e 100644
--- a/applications/ColossalEval/colossal_eval/models/huggingface.py
+++ b/applications/ColossalEval/colossal_eval/models/huggingface.py
@@ -1,11 +1,11 @@
 import copy
-import math
 from typing import Any, Dict, List, Optional, Tuple
 
 import numpy as np
 import torch
 from colossal_eval.utils import Conversation, get_batch_prompt, is_rank_0
 from peft import PeftModel
+from torch.utils.data import DataLoader
 from tqdm import tqdm
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer
 
@@ -130,7 +130,7 @@ def _load_model(
         if shard_config is not None:
             self.model = AutoModel.from_pretrained(path, **model_kwargs)
             shard_former = ShardFormer(shard_config)
-            self.model, sharded_parameters = shard_former.optimize(self.model)
+            self.model, _ = shard_former.optimize(self.model)
             self.model.to(get_current_device())
 
             if peft_path is not None:
@@ -325,7 +325,7 @@ def _get_input_ids_and_labels(
 
         return input_ids_list, labels_list, None
 
-    def inference(self, data: List[Dict], inference_kwargs: Dict[str, Any], debug: bool = False) -> List[Dict]:
+    def inference(self, data_loader: DataLoader, inference_kwargs: Dict[str, Any], debug: bool = False) -> List[Dict]:
         """
         Infer the given data.
         This function will call self.generate() to get model outputs and also self.model() to get logits.
@@ -359,26 +359,23 @@ def inference(self, data: List[Dict], inference_kwargs: Dict[str, Any], debug: b
 
             self.str_label_map = {choice: idx for idx, choice in enumerate(self.choices)}
 
-        turn = 0 if not isinstance(data[0]["output"], list) else len(data[0]["output"]) + 1
-        turn_desc = "" if turn == 0 else f"-turn{turn}"
-
         bar = tqdm(
-            range(math.ceil(len(data) / self.batch_size)),
-            desc=f"{data[0]['dataset']}-{data[0]['category']}{turn_desc} Inference steps",
+            range(len(data_loader)),
+            desc=f"{inference_kwargs['dataset']}-{inference_kwargs['category']} Inference steps",
             disable=not is_rank_0(),
         )
         loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
 
-        answers = copy.deepcopy(data)
-        for i in range(0, len(data), self.batch_size):
-            batch = data[i : i + self.batch_size]
+        answers = []
+
+        for i, batch in enumerate(data_loader):
             batch_prompt, batch_target = get_batch_prompt(
-                self.prompt_template, batch, few_shot_data, self.tokenizer, language, self.model_max_length
+                self.prompt_template, batch, few_shot_data, self.tokenizer, self.model_max_length
             )
 
             if is_rank_0() and debug and i == 0:
                 self.logger.info(
-                    f"Inference arguments for dataset {data[0]['dataset']} category {data[0]['category']} is:\n{inference_kwargs}"
+                    f"Inference arguments for dataset {batch[0]['dataset']} category {batch[0]['category']} is:\n{inference_kwargs}"
                 )
                 self.logger.info("-" * 120)
                 self.logger.info("An example prompt and prompt with target is:")
@@ -402,7 +399,7 @@ def inference(self, data: List[Dict], inference_kwargs: Dict[str, Any], debug: b
                 # Otherwise this will violate the single-choice setting.
 
                 if calculate_loss:
-                    labels = [self.str_label_map[answers[i + j]["target"]] for j in range(len(batch_decodes))]
+                    labels = [self.str_label_map[batch[j]["target"]] for j in range(len(batch))]
 
                     loss_over_choices = loss_fct(scores, torch.tensor(labels, dtype=torch.long)).numpy().tolist()
 
@@ -411,29 +408,30 @@ def inference(self, data: List[Dict], inference_kwargs: Dict[str, Any], debug: b
                     {choice: probs[i][self.str_label_map[choice]] for choice in self.choices} for i in range(len(probs))
                 ]
 
-            for j in range(len(batch_prompt)):
+            for j in range(len(batch)):
                 if not pretrain:
-                    if isinstance(answers[i + j]["output"], list):
-                        answers[i + j]["output"].append(batch_decodes[j].strip())
+                    if isinstance(batch[j]["output"], list):
+                        batch[j]["output"].append(batch_decodes[j].strip())
                     else:
-                        answers[i + j]["output"] = batch_decodes[j].strip()
+                        batch[j]["output"] = batch_decodes[j].strip()
 
                     if isinstance(scores, torch.Tensor):
-                        answers[i + j]["logits_over_choices"] = probs[j]
+                        batch[j]["logits_over_choices"] = probs[j]
 
                         if calculate_loss:
-                            answers[i + j]["loss_over_choices"] = loss_over_choices[j]
+                            batch[j]["loss_over_choices"] = loss_over_choices[j]
 
                 if calculate_loss:
-                    answers[i + j]["loss"] = (np.array(batch_losses[j]) / np.array(batch_target_token_nums[j])).tolist()
+                    batch[j]["loss"] = (np.array(batch_losses[j]) / np.array(batch_target_token_nums[j])).tolist()
 
                     # loss_sum is specially used for pertrain dataset for calculating per-byte-perplexity.
                     # However, loss (which is per sample loss) suffices for most cases.
-                    answers[i + j]["loss_sum"] = batch_losses[j]
-                    answers[i + j]["token_num"] = batch_target_token_nums[j]
+                    batch[j]["loss_sum"] = batch_losses[j]
+                    batch[j]["token_num"] = batch_target_token_nums[j]
 
                     if batch_bytes_nums:
-                        answers[i + j]["byte_num"] = batch_bytes_nums[j]
+                        batch[j]["byte_num"] = batch_bytes_nums[j]
+            answers.extend(batch)
 
             bar.update()
 
@@ -600,7 +598,7 @@ def _load_model(
         if shard_config is not None:
             self.model = AutoModelForCausalLM.from_pretrained(path, **model_kwargs)
             shard_former = ShardFormer(shard_config)
-            self.model, sharded_parameters = shard_former.optimize(self.model)
+            self.model, _ = shard_former.optimize(self.model)
             self.model.to(get_current_device())
 
             if peft_path is not None:
diff --git a/applications/ColossalEval/colossal_eval/utils/conversation.py b/applications/ColossalEval/colossal_eval/utils/conversation.py
index 330083aa6a61..c0445e84ec76 100644
--- a/applications/ColossalEval/colossal_eval/utils/conversation.py
+++ b/applications/ColossalEval/colossal_eval/utils/conversation.py
@@ -123,15 +123,13 @@ def dict(self):
         }
 
 
-def get_few_shot_prefix(
-    conv: Conversation, few_shot_data: List[str], tokenizer: Optional[AutoTokenizer], language: str, max_tokens: int
-) -> str:
+def get_few_shot_prefix(few_shot_data: List[str], tokenizer: Optional[AutoTokenizer], max_tokens: int) -> str:
     """
     Get few shot prefix.
 
     Args:
-        conv: Conversation template.
-        few_shot_examples: Few shot examples to generate few shot prompt prefix.
+        few_shot_data: Few shot examples to generate few shot prompt prefix.
+        tokenizer: tokenizer used to tokenize data.
 
     Returns:
         Few shot prompt prefix.
@@ -157,7 +155,6 @@ def get_batch_prompt(
     batch: List[Dict],
     few_shot_data: List[str],
     tokenizer: Optional[AutoTokenizer],
-    language: Optional[str],
     model_max_length: Optional[int],
 ) -> Tuple[List[Dict], List[Dict]]:
     """
@@ -167,6 +164,7 @@ def get_batch_prompt(
         conv: Conversation template.
         batch: Batch data to generate prompt from.
         few_shot_data: Few shot data to generate few shot prompt prefix.
+        tokenizer: tokenizer used to tokenize data.
 
     Returns:
         Tuple containg batch prompt and target.
@@ -192,7 +190,7 @@ def get_batch_prompt(
                 else:
                     raise Exception("When using few-shot, target answer should be a string.")
 
-                few_shot_prefix = get_few_shot_prefix(conv, few_shot_data, tokenizer, language, max_tokens)
+                few_shot_prefix = get_few_shot_prefix(few_shot_data, tokenizer, max_tokens)
 
                 conv.append_message(conv.roles[0], few_shot_prefix + query_text)
                 conv.append_message(conv.roles[1], None)
diff --git a/applications/ColossalEval/examples/dataset_evaluation/inference.py b/applications/ColossalEval/examples/dataset_evaluation/inference.py
index a7307635d333..c651970ee37c 100644
--- a/applications/ColossalEval/examples/dataset_evaluation/inference.py
+++ b/applications/ColossalEval/examples/dataset_evaluation/inference.py
@@ -5,6 +5,8 @@
 
 import torch.distributed as dist
 from colossal_eval import dataset, models, utils
+from colossal_eval.dataset.base import DistributedDataset
+from torch.utils.data import DataLoader, DistributedSampler
 
 import colossalai
 from colossalai.accelerator import get_accelerator
@@ -13,6 +15,7 @@
 from colossalai.shardformer import ShardConfig
 
 logger = get_dist_logger()
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
 def rm_and_merge(
@@ -54,7 +57,8 @@ def rm_and_merge(
                         )
                     else:
                         rank_answers = utils.jload(directory)
-                        answers["data"].extend(rank_answers["data"])
+                        deduplidate_answers = [x for x in rank_answers["data"] if x not in answers["data"]]
+                        answers["data"].extend(deduplidate_answers)
                         answers["inference_kwargs"] = rank_answers["inference_kwargs"]
 
                 for r in range(dp_size):
@@ -65,7 +69,7 @@ def rm_and_merge(
                         os.remove(directory)
                     except Exception as e:
                         print(e)
-
+                print(len(answers["data"]))
                 all_answers[category] = answers
 
             all_answers_with_dataset_class["inference_results"] = all_answers
@@ -108,7 +112,12 @@ def main(args):
     tp_rank = coordinates[TP_AXIS]
 
     shard_config = (
-        ShardConfig(tensor_parallel_process_group=tp_group, enable_tensor_parallelism=args.tp_size > 1)
+        ShardConfig(
+            tensor_parallel_process_group=tp_group,
+            enable_tensor_parallelism=args.tp_size > 1,
+            parallel_output=False,
+            enable_all_optimization=True,
+        )
         if args.tp_size > 1
         else None
     )
@@ -183,6 +192,7 @@ def main(args):
         model_name = model_parameter["name"]
         model_class = eval(f"models.{model_parameter['model_class']}")
         paramerters = model_parameter["parameters"]
+        batch_size = paramerters["batch_size"]
         paramerters.update({"logger": logger})
         paramerters.update({"prompt_template": utils.prompt_templates[paramerters["prompt_template"]]})
         paramerters.update({"shard_config": shard_config})
@@ -192,7 +202,6 @@ def main(args):
             raise ValueError(f"Model class {model_parameter['model_class']} is not a subclass of BaseModel.")
 
         for dataset_name, split_data in inference_data.items():
-            start = 0
             prev_questions = None
             for category, category_data in split_data.items():
                 num_turn = category_data["inference_kwargs"].get("turns", 1)
@@ -201,26 +210,33 @@ def main(args):
                     raise Exception(f"Dataset {dataset_name} doesn't have few-shot data for category {category}!")
 
                 answers_to_dump = copy.deepcopy(category_data)
-                partition_size = len(category_data["data"]) // dp_size
-                redundant = len(category_data["data"]) % dp_size
-
-                # Ensure that the amount of data for inference is as consistent as possible across different processes.
-                lengths = [partition_size for _ in range(dp_size)]
-                for j in range(redundant):
-                    lengths[(j + start) % dp_size] += 1
-
-                start = (start + redundant) % dp_size
-
                 for turn in range(num_turn):
                     if turn == 0:
-                        questions = category_data["data"][
-                            sum(lengths[0:dp_rank]) : sum(lengths[0:dp_rank]) + lengths[dp_rank]
-                        ]
+                        dist_dataset = DistributedDataset(category_data["data"])
                     else:
-                        questions = prev_questions
+                        dist_dataset = DistributedDataset(prev_questions)
+
+                    sampler = DistributedSampler(
+                        dist_dataset,
+                        num_replicas=pg_mesh.size(DP_AXIS),
+                        rank=pg_mesh.coordinate(DP_AXIS),
+                        shuffle=False,
+                    )
+                    questions_loader = DataLoader(
+                        dist_dataset,
+                        batch_size=batch_size,
+                        sampler=sampler,
+                        num_workers=8,
+                        pin_memory=True,
+                        collate_fn=lambda x: x,
+                    )
+                    category_data["inference_kwargs"]["dataset"] = dataset_name
+                    category_data["inference_kwargs"]["category"] = category
 
                     answers_per_rank = model_.inference(
-                        questions, inference_kwargs=category_data["inference_kwargs"], debug=debug_args[dataset_name]
+                        data_loader=questions_loader,
+                        inference_kwargs=category_data["inference_kwargs"],
+                        debug=debug_args[dataset_name],
                     )
                     prev_questions = answers_per_rank
 
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index c210ca91e68a..bf0788650811 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -2,7 +2,7 @@
 import random
 import warnings
 from collections import defaultdict
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 from copy import deepcopy
 from functools import partial
 from types import MethodType
@@ -30,11 +30,15 @@
 from colossalai.nn.optimizer import DistGaloreAwamW, cast_to_distributed
 from colossalai.pipeline.schedule import InterleavedSchedule, OneForwardOneBackwardSchedule
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.quantization import BnbQuantizationConfig, quantize_model
 from colossalai.shardformer import GradientCheckpointConfig, ShardConfig, ShardFormer
 from colossalai.shardformer.layer.utils import SeqParallelUtils
 from colossalai.shardformer.policies.base_policy import Policy
+from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.tensor.d_tensor.api import is_distributed_tensor
+from colossalai.tensor.param_op_hook import ColoParamOpHookManager
 from colossalai.zero.low_level import LowLevelZeroOptimizer
+from colossalai.zero.low_level.zero_hook import ZeroOpHook, wait_all_gather_handle
 
 from .pp_plugin_base import PipelinePluginBase
 
@@ -61,6 +65,7 @@ def __init__(
         use_ddp: bool,
         ddp_config: dict,
         custom_policy: Policy,
+        overlap_allgather: bool = False,
     ) -> None:
         self.stage_manager = shard_config.pipeline_stage_manager
         self.shard_config = shard_config
@@ -69,6 +74,7 @@ def __init__(
         self.sp_group = sp_group
         self.use_dpp = use_ddp
         self.require_grad_sync = True
+        self.overlap_allgather = overlap_allgather
 
         shardformer = ShardFormer(shard_config)
         if custom_policy is not None:
@@ -106,6 +112,12 @@ def __init__(
             module = DDP(module, process_group=dp_group, **ddp_config)
 
         super().__init__(module)
+        if overlap_allgather:
+            self.op_hook = ZeroOpHook()
+            for p in module.parameters():
+                if p.requires_grad and type(p) is not ColoParameter:
+                    p.__class__ = ColoParameter
+                    p.__init__(p, requires_grad=True)
 
     def sync_shared_params(self):
         for shared_param, group in zip(self.shared_params, self.shared_param_process_groups):
@@ -197,7 +209,8 @@ def forward(self, *args, **kwargs):
         if self.convert_fn is not None:
             args = tree_map(self.convert_fn, args)
             kwargs = tree_map(self.convert_fn, kwargs)
-        return super().forward(*args, **kwargs)
+        with self._wait_all_gather():
+            return super().forward(*args, **kwargs)
 
     def unwrap(self):
         module = super().unwrap()
@@ -205,6 +218,13 @@ def unwrap(self):
             module = module.module
         return module
 
+    def _force_wait_all_gather(self):
+        for p in self.module.parameters():
+            wait_all_gather_handle(p)
+
+    def _wait_all_gather(self):
+        return ColoParamOpHookManager.use_hooks(self.op_hook) if self.overlap_allgather else nullcontext()
+
 
 def get_param_info(optim: Optimizer):
     # Get a backup of necessary information of parameters for future use, which includes:
@@ -235,7 +255,7 @@ def get_param_info(optim: Optimizer):
     return param_info
 
 
-def init_pipeline_optimizer(optim: Optimizer, model: Module):
+def reinitialize_optimizer(optim: Optimizer, model: Module):
     model_params = set(model.parameters())
     new_param_groups = []
     for group in optim.param_groups:
@@ -257,7 +277,7 @@ def __init__(
     ):
         self.param_info = param_info
         if use_pipeline:
-            init_pipeline_optimizer(optim, model)
+            reinitialize_optimizer(optim, model)
         self.model = model
         self.stage_manager = model.stage_manager
         self.shared_params = model.shared_params
@@ -478,7 +498,7 @@ def __init__(
         self.tp_size = get_world_size(self.tp_pg) if self.tp_pg is not None else 1
         self.pp_size = get_world_size(self.pp_pg) if self.pp_pg is not None else 1
         if use_pipeline:
-            init_pipeline_optimizer(optim, model)
+            reinitialize_optimizer(optim, model)
         super().__init__(
             optim,
             precision=precision,
@@ -632,6 +652,7 @@ def __init__(
         model: HybridParallelModule,
         use_pipeline: bool,
         param_info: OrderedDict,
+        pg_to_param_list: Dict[ProcessGroup, List[torch.nn.Parameter]] = None,
         initial_scale: int = 2**16,  # grad scaler config
         min_scale: int = 1,
         growth_factor: float = 2.0,
@@ -650,6 +671,7 @@ def __init__(
         tp_process_group: Optional[ProcessGroup] = None,  # if using tp
         pp_process_group: Optional[ProcessGroup] = None,  # if using pp
         forced_dtype: Optional[torch.dtype] = None,
+        overlap_allgather: bool = False,
     ):
         self.model = model
         self.param_info = param_info
@@ -658,11 +680,12 @@ def __init__(
         self.tp_pg = tp_process_group
         self.pp_pg = pp_process_group
         if use_pipeline:
-            init_pipeline_optimizer(optimizer, model)
+            reinitialize_optimizer(optimizer, model)
         super().__init__(
             optimizer=optimizer,
             initial_scale=initial_scale,
             min_scale=min_scale,
+            pg_to_param_list=pg_to_param_list,
             growth_factor=growth_factor,
             backoff_factor=backoff_factor,
             growth_interval=growth_interval,
@@ -677,6 +700,7 @@ def __init__(
             cpu_offload=cpu_offload,
             dp_process_group=dp_process_group,
             forced_dtype=forced_dtype,
+            overlap_allgather=overlap_allgather,
         )
 
     def sync_dp_grads(self):
@@ -993,9 +1017,11 @@ def __init__(
         make_vocab_size_divisible_by: int = 64,
         dp_outside: bool = True,
         overlap_p2p: bool = True,
+        overlap_allgather: bool = False,
         fp8_communication: bool = False,
     ) -> None:
         super().__init__()
+
         assert (
             dist.get_world_size() % (tp_size * pp_size) == 0
         ), f"World size {dist.get_world_size()} is not divisible by tp_size {tp_size} * pp_size {pp_size}"
@@ -1038,17 +1064,7 @@ def __init__(
         self.enable_jit_fused = enable_jit_fused
         self.enable_sequence_parallelism = enable_sequence_parallelism
         if dp_outside:
-            (
-                self.dp_axis,
-                self.pp_axis,
-                self.tp_axis,
-                self.sp_axis,
-            ) = (
-                0,
-                1,
-                2,
-                3,
-            )
+            self.dp_axis, self.pp_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3
             self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size, self.sp_size)
         else:
             self.pp_axis, self.dp_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3
@@ -1148,6 +1164,7 @@ def __init__(
             cpu_offload=cpu_offload,
             partition_grad=(self.zero_stage == 2),
             forced_dtype=PRECISION_TORCH_TYPE[precision],
+            overlap_allgather=overlap_allgather,
         )
 
         self.max_norm = max_norm
@@ -1176,7 +1193,7 @@ def support_no_sync(self) -> bool:
         return True
 
     def support_lora(self) -> bool:
-        return False
+        return True
 
     def control_checkpoint_io(self) -> bool:
         return True
@@ -1210,6 +1227,7 @@ def configure(
                 and self.enable_sequence_parallelism
                 and self.sequence_parallelism_mode == "all_to_all"
             )
+            # sync gradients across DP * SP ranks
             if self.enable_sequence_parallelism and self.sequence_parallelism_mode == "all_to_all":
                 dp_group = self.pg_mesh.create_group_along_axis([self.dp_axis, self.sp_axis])
             else:
@@ -1224,6 +1242,7 @@ def configure(
                 use_ddp=use_ddp,
                 ddp_config=self.ddp_config,
                 custom_policy=self.custom_policy,
+                overlap_allgather=(self.zero_stage > 0 and self.zero_config["overlap_allgather"]),
             )
         if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
             if zero_stage == 0:
@@ -1306,7 +1325,7 @@ def execute_pipeline(
         # so we disable it, performing manual reduction instead.
         ctx = optimizer.no_sync() if isinstance(optimizer, HybridParallelZeroOptimizer) else model.no_sync()
 
-        with ctx:
+        with ctx, model._wait_all_gather():
             outputs = self.schedule.forward_backward_step(
                 model, data_iter, criterion, optimizer, return_loss, return_outputs
             )
@@ -1362,15 +1381,15 @@ def prepare_dataloader(
             kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
                     `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
 
-        Returns:
+        Returns:`
             :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
         """
         _kwargs = kwargs.copy()
         distributed_sampler_cls = distributed_sampler_cls or DistributedSampler
         sampler = distributed_sampler_cls(
             dataset,
-            num_replicas=self.pg_mesh.size(self.dp_axis),
-            rank=self.pg_mesh.coordinate(self.dp_axis),
+            num_replicas=self.dp_group.size(),
+            rank=dist.get_group_rank(self.dp_group, global_rank=dist.get_rank()),
             shuffle=shuffle,
         )
 
@@ -1402,6 +1421,24 @@ def no_sync(self, model: Module, optimizer: OptimizerWrapper) -> Iterator[None]:
         return optimizer.no_sync() if isinstance(optimizer, HybridParallelZeroOptimizer) else model.no_sync()
 
     def enable_lora(
-        self, model: Module, pretrained_dir: Optional[str] = None, lora_config: Optional[Dict] = None
+        self,
+        model: Module,
+        pretrained_dir: Optional[str] = None,
+        lora_config: Optional[Dict] = None,
+        bnb_quantization_config: Optional[BnbQuantizationConfig] = None,
     ) -> Module:
-        raise NotImplementedError
+        from peft import PeftModel, get_peft_model
+
+        assert not isinstance(model, HybridParallelModule), "Lora should be enabled before boosting the model."
+        assert self.pp_size == 1 and self.tp_size == 1
+        self.lora_enabled = True
+        warnings.warn("You have enabled LoRa training. Please check the hyperparameters such as lr")
+
+        if bnb_quantization_config is not None:
+            model = quantize_model(model, bnb_quantization_config)
+
+        if pretrained_dir is None:
+            peft_model = get_peft_model(model, lora_config)
+        else:
+            peft_model = PeftModel.from_pretrained(model, pretrained_dir, is_trainable=True)
+        return peft_model
diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index bfb8930bb99c..64f264f7eba1 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import warnings
+from contextlib import nullcontext
 from functools import partial
 from pathlib import Path
 from types import MethodType
@@ -34,7 +35,10 @@
 from colossalai.interface.optimizer import DistributedOptim
 from colossalai.nn.optimizer import DistGaloreAwamW, cast_to_distributed
 from colossalai.quantization import BnbQuantizationConfig, quantize_model
+from colossalai.tensor.colo_parameter import ColoParameter
+from colossalai.tensor.param_op_hook import ColoParamOpHookManager
 from colossalai.zero import LowLevelZeroOptimizer
+from colossalai.zero.low_level.zero_hook import ZeroOpHook, wait_all_gather_handle
 
 from .dp_plugin_base import DPPluginBase
 from .torch_ddp_plugin import TorchDDPCheckpointIO
@@ -58,7 +62,7 @@ class OptimizerParamCheckState(enum.Enum):
 
 
 class LowLevelZeroModel(ModelWrapper, AMPModelMixin):
-    def __init__(self, module: nn.Module, precision: str) -> None:
+    def __init__(self, module: nn.Module, precision: str, overlap_allgather: bool = False) -> None:
         super().__init__(module)
         self.dtype = None
         if precision == "fp16":
@@ -72,12 +76,25 @@ def __init__(self, module: nn.Module, precision: str) -> None:
         self.convert_fn = None
         if self.dtype is not None:
             self.convert_fn = partial(_convert_floating_point, dtype=self.dtype)
+        self.overlap_allgather = overlap_allgather
+        if overlap_allgather:
+            self.op_hook = ZeroOpHook()
+            for p in module.parameters():
+                if p.requires_grad and type(p) is not ColoParameter:
+                    p.__class__ = ColoParameter
+                    p.__init__(p, requires_grad=True)
 
     def forward(self, *args, **kwargs):
         if self.convert_fn is not None:
             args = tree_map(self.convert_fn, args)
             kwargs = tree_map(self.convert_fn, kwargs)
-        return super().forward(*args, **kwargs)
+        ctx = ColoParamOpHookManager.use_hooks(self.op_hook) if self.overlap_allgather else nullcontext()
+        with ctx:
+            return super().forward(*args, **kwargs)
+
+    def _force_wait_all_gather(self):
+        for p in self.module.parameters():
+            wait_all_gather_handle(p)
 
 
 class LowLevelZeroCheckpointIO(TorchDDPCheckpointIO):
@@ -209,6 +226,7 @@ def load_sharded_optimizer(self, optimizer: OptimizerWrapper, index_file_path: s
 
     def load_unsharded_model(self, model: ModelWrapper, checkpoint: str, strict: bool = True):
         assert isinstance(model, LowLevelZeroModel), "Please boost the model before loading!"
+        model._force_wait_all_gather()
         super().load_unsharded_model(model, checkpoint, strict)
         model.update_master_params()
 
@@ -221,9 +239,30 @@ def load_sharded_model(
         load_sub_module: bool = True,
     ):
         assert isinstance(model, LowLevelZeroModel), "Please boost the model before loading!"
+        model._force_wait_all_gather()
         super().load_sharded_model(model, checkpoint_index_file, strict, use_safetensors, load_sub_module)
         model.update_master_params()
 
+    def save_unsharded_model(self, model: ModelWrapper, checkpoint: str, gather_dtensor: bool, use_safetensors: bool):
+        assert isinstance(model, LowLevelZeroModel), "Please boost the model before loading!"
+        model._force_wait_all_gather()
+        return super().save_unsharded_model(model, checkpoint, gather_dtensor, use_safetensors)
+
+    def save_sharded_model(
+        self,
+        model: ModelWrapper,
+        checkpoint_path: str,
+        gather_dtensor: bool = True,
+        prefix: Optional[str] = None,
+        max_shard_size: int = 1024,
+        use_safetensors: bool = False,
+    ):
+        assert isinstance(model, LowLevelZeroModel), "Please boost the model before loading!"
+        model._force_wait_all_gather()
+        return super().save_sharded_model(
+            model, checkpoint_path, gather_dtensor, prefix, max_shard_size, use_safetensors
+        )
+
     def save_lora_as_pretrained(self, model, checkpoint, use_safetensors):
         if os.path.isfile(checkpoint):
             logging.error(f"Provided path ({checkpoint}) should be a directory, not a file")
@@ -231,6 +270,7 @@ def save_lora_as_pretrained(self, model, checkpoint, use_safetensors):
         from peft import PeftModel
 
         assert isinstance(model, ModelWrapper), "Please boost the model before saving!"
+        model._force_wait_all_gather()
         peft_model = model.unwrap()
         assert isinstance(
             peft_model, PeftModel
@@ -290,6 +330,7 @@ def __init__(
         reduce_bucket_size_in_m: int = 12,
         communication_dtype: Optional[torch.dtype] = None,
         overlap_communication: bool = True,
+        overlap_allgather: bool = False,
         cpu_offload: bool = False,
         master_weights: bool = True,
         verbose: bool = False,
@@ -316,6 +357,7 @@ def __init__(
             partition_grad=(stage == 2),
             cpu_offload=cpu_offload,
             master_weights=master_weights,
+            overlap_allgather=overlap_allgather,
             fp8_communication=fp8_communication,
         )
         self.lora_enabled = False
@@ -406,7 +448,7 @@ def add_lora_params_to_optimizer(self, model, optimizer):
                 group_id, check_state = self.get_param_group_id(optimizer, origin_param, param)
                 if check_state == OptimizerParamCheckState.ORIGIN_PARAM_NOT_FIND:
                     warnings.warn(
-                        "Origin parameter {origin_key} related to {name} doesn't exist in optimizer param_groups."
+                        f"Origin parameter {origin_key} related to {name} doesn't exist in optimizer param_groups."
                     )
                 elif (
                     check_state == OptimizerParamCheckState.ORIGIN_PARAM_FINDED
@@ -433,7 +475,9 @@ def configure(
                 self.add_lora_params_to_optimizer(model, optimizer)
 
         if not isinstance(model, ModelWrapper):
-            model = LowLevelZeroModel(model, self.precision)
+            model = LowLevelZeroModel(
+                model, self.precision, overlap_allgather=self.zero_optim_kwargs["overlap_allgather"]
+            )
 
         # TODO: Support Galore + ZeRO
         zero_stage = self.stage
diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index 2cfdd000a2e0..b3415af0eed6 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -1,9 +1,8 @@
-import random
 import warnings
+from collections import defaultdict
 from types import MethodType
-from typing import Callable, Optional, OrderedDict, Tuple
+from typing import Callable, List, Optional, OrderedDict, Tuple
 
-import numpy as np
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
@@ -11,34 +10,42 @@
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
-from torch.utils.data.distributed import DistributedSampler
 
 from colossalai.booster.plugin.hybrid_parallel_plugin import (
+    PRECISION_TORCH_TYPE,
+    SUPPORT_SP_MODE,
     HybridParallelAMPOptimizer,
     HybridParallelModule,
     HybridParallelNaiveOptimizer,
     HybridParallelPlugin,
+    HybridParallelZeroOptimizer,
     get_param_info,
-    init_pipeline_optimizer,
+    reinitialize_optimizer,
 )
 from colossalai.checkpoint_io import MoECheckpointIO
-from colossalai.cluster import ProcessGroupMesh
+from colossalai.cluster.process_group_mesh import ProcessGroupMesh
 from colossalai.interface import ModelWrapper, OptimizerWrapper
-from colossalai.logging import get_dist_logger
-from colossalai.pipeline.schedule import OneForwardOneBackwardSchedule
+from colossalai.interface.optimizer import DistributedOptim
+from colossalai.nn.optimizer import cast_to_distributed
+from colossalai.pipeline.schedule.interleaved_pp import InterleavedSchedule
+from colossalai.pipeline.schedule.one_f_one_b import OneForwardOneBackwardSchedule
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer import ShardConfig
 from colossalai.shardformer.policies.base_policy import Policy
+from colossalai.shardformer.shard.grad_ckpt_config import GradientCheckpointConfig
+from colossalai.shardformer.shard.shard_config import ShardConfig
 from colossalai.tensor.moe_tensor.api import is_moe_tensor
-from colossalai.zero.low_level import LowLevelZeroOptimizer
 
 
-class MoeHybridParallelZeroOptimizer(LowLevelZeroOptimizer):
+class MoeHybridParallelZeroOptimizer(HybridParallelZeroOptimizer):
     def __init__(
         self,
         optimizer: Optimizer,
         model: Module,
         use_pipeline: bool,
+        dp_process_group: Optional[ProcessGroup],  # the dp pg for comm
+        tp_process_group: Optional[ProcessGroup],  # if using tp
+        pp_process_group: Optional[ProcessGroup],  # if using pp
+        moe_dp_group: ProcessGroup,  # moe dp pg for comm
         param_info: OrderedDict,
         initial_scale: int = 2**16,  # grad scaler config
         min_scale: int = 1,
@@ -51,37 +58,25 @@ def __init__(
         verbose: bool = False,
         reduce_bucket_size: int = 1024 * 1024,  # communication
         communication_dtype: Optional[torch.dtype] = None,
-        overlap_communication: bool = True,
+        overlap_communication: bool = False,
         partition_grad: bool = False,  # stage 2 flag
         cpu_offload: bool = False,  # cpu offload
-        dp_process_group: Optional[ProcessGroup] = None,  # the dp pg for comm
-        tp_process_group: Optional[ProcessGroup] = None,  # if using tp
-        pp_process_group: Optional[ProcessGroup] = None,
         forced_dtype: Optional[torch.dtype] = None,
-        moe_extra_dp_process_group: Optional[ProcessGroup] = None,
+        overlap_allgather: bool = False,
     ):
-        self.param_info = param_info
-        self.stage_manager = model.stage_manager
-        self.shared_params = model.shared_params
-        self.dp_pg = dp_process_group
-        self.tp_pg = tp_process_group
-        self.pp_pg = pp_process_group
-        if use_pipeline:
-            init_pipeline_optimizer(optimizer, model)
-
         pg_param_list = {
-            dp_process_group: [],
-            moe_extra_dp_process_group: [],
+            dp_process_group: list(filter(lambda p: not is_moe_tensor(p), model.parameters())),
+            moe_dp_group: list(filter(is_moe_tensor, model.parameters())),
         }
-        for param in model.parameters():
-            if is_moe_tensor(param):
-                pg_param_list[moe_extra_dp_process_group].append(param)
-            else:
-                pg_param_list[dp_process_group].append(param)
+
+        if len(pg_param_list[dp_process_group]) == 0 or len(pg_param_list[moe_dp_group]) == 0:
+            raise ValueError("No parameters found in dp_process_group or moe_dp_group")
 
         super().__init__(
+            model=model,
             optimizer=optimizer,
-            pg_to_param_list=pg_param_list,
+            use_pipeline=use_pipeline,
+            param_info=param_info,
             initial_scale=initial_scale,
             min_scale=min_scale,
             growth_factor=growth_factor,
@@ -96,30 +91,37 @@ def __init__(
             overlap_communication=overlap_communication,
             partition_grad=partition_grad,
             cpu_offload=cpu_offload,
+            tp_process_group=tp_process_group,
+            pp_process_group=pp_process_group,
             forced_dtype=forced_dtype,
+            pg_to_param_list=pg_param_list,
+            overlap_allgather=overlap_allgather,
         )
 
 
 class MoeHybridParallelPlugin(HybridParallelPlugin):
     """
-    Plugin for Moe Hybrid Parallel Training.
+    Plugin for MoE Hybrid Parallel Training, which is similar to HybridParallelPlugin
     Tensor parallel, pipeline parallel and data parallel(DDP/ZeRO) can be picked and combined in this plugin.
     The size of tp and pp should be passed in by user, then the size of dp is automatically calculated from dp_size = world_size / (tp_size * pp_size).
 
-    Example:
-        >>> from colossalai.booster import Booster
-        >>> from colossalai.booster.plugin import HybridParallelPlugin
+    ```python
+    from colossalai.booster import Booster
+    from colossalai.booster.plugin import MoeHybridParallelPlugin
 
-        >>> model, train_dataset, optimizer, criterion = ...
-        >>> plugin =  HybridParallelPlugin(tp_size=2, pp_size=2)
+    model, train_dataset, optimizer, criterion = ...
+    plugin =  MoeHybridParallelPlugin(tp_size=2, pp_size=2, ep_size=2)
 
-        >>> train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=8)
-        >>> booster = Booster(plugin=plugin)
-        >>> model, optimizer, criterion, train_dataloader, _ = booster.boost(model, optimizer, criterion, train_dataloader)
+    train_dataloader = plugin.prepare_dataloader(train_dataset, batch_size=8)
+    booster = Booster(plugin=plugin)
+    model, optimizer, criterion, train_dataloader, _ = booster.boost(model, optimizer, criterion, train_dataloader)
+    ```
 
     Args:
-        pp_size (int): The number of pipeline stages in pipeline parallelism. Pipeline parallelism will not be used when pp_size is set to 1.
         tp_size (int): The size of tensor parallelism. Tensor parallelism will not be used when tp_size is set to 1.
+        pp_size (int): The number of pipeline stages in pipeline parallelism. Pipeline parallelism will not be used when pp_size is set to 1.
+        ep_size (int): The size of expert parallelism
+        sp_size (int): The size of sequence parallelism.
         precision (str, optional): Specifies the precision of parameters during training.
                                     Auto-mixied precision will be used when this argument is set to 'fp16' or 'bf16', otherwise model is trained with 'fp32'.
                                     Defaults to 'fp16'.
@@ -132,7 +134,9 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
         enable_flash_attention (bool, optional): Whether to switch on flash attention in Shardformer. Defaults to False.
         enable_jit_fused (bool, optional): Whether to switch on JIT in Shardformer. Default to False.
         enable_sequence_parallelism (bool): Whether to turn on sequence parallelism in Shardformer. Defaults to False.
+        sequence_parallelism_mode (str): The Sequence parallelism mode. Can only be choosed from ["split_gather", "ring", "all_to_all"]. Defaults to "split_gather".
         enable_sequence_overlap (bool): Whether to turn on sequence overlap in Shardformer. Defaults to False.
+        parallel_output (bool): Whether to keep the output parallel when enabling tensor parallelism. Default to True.
         num_microbatches (int, optional): Number of microbatches when using pipeline parallelism. Defaults to None.
         microbatch_size (int, optional): Microbatch size when using pipeline parallelism.
             Either ``num_microbatches`` or ``microbatch_size`` should be provided if using pipeline.
@@ -155,15 +159,21 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
         cpu_offload (bool, optional): Whether to open cpu_offload when using ZeRO. Defaults to False.
         communication_dtype (torch.dtype, optional): Communication dtype when using ZeRO. If not specified, the dtype of param will be used. Defaults to None.
         overlap_communication (bool, optional): Whether to overlap communication and computation when using ZeRO. Defaults to True.
-        use_ep_inside (bool, Optional): Whether to use ep inside dp (intra-node) for moe params.
+        custom_policy (Policy, optional): Custom policy for Shardformer. Defaults to None.
+        pp_style (str, optional): The style for pipeline parallelism. Defaults to '1f1b'.
+        num_model_chunks (int, optional): The number of model chunks for interleaved pipeline parallelism. Defaults to 1.
+        gradient_checkpoint_config (GradientCheckpointConfig, optional): Configuration for gradient checkpointing. Defaults to None.
+        enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
+        make_vocab_size_divisible_by (int, optional): it's used when padding the vocabulary size, to make it choose an faster kenel. Default to 64.
+        overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism
     """
 
     def __init__(
         self,
+        tp_size: int,
         pp_size: int,
         ep_size: int,
-        tp_size: int = 1,
-        sp_size: int = 1,
+        sp_size: int = None,
         precision: str = "fp16",
         zero_stage: int = 0,
         enable_all_optimization: bool = False,
@@ -171,7 +181,9 @@ def __init__(
         enable_flash_attention: bool = False,
         enable_jit_fused: bool = False,
         enable_sequence_parallelism: bool = False,
+        sequence_parallelism_mode: str = None,
         enable_sequence_overlap: bool = False,
+        parallel_output: bool = True,
         num_microbatches: Optional[int] = None,
         microbatch_size: Optional[int] = None,
         initial_scale: float = 2**16,
@@ -191,27 +203,61 @@ def __init__(
         zero_bucket_size_in_m: int = 12,
         cpu_offload: bool = False,
         communication_dtype: Optional[torch.dtype] = None,
-        overlap_communication: bool = True,
-        use_ep_inside: bool = True,
+        overlap_communication: bool = False,
         custom_policy: Policy = None,
-        checkpoint_io: Optional[MoECheckpointIO] = None,
+        pp_style: str = "1f1b",
+        num_model_chunks: int = 1,
+        num_layers_per_stage: Optional[List[int]] = None,
+        gradient_checkpoint_config: Optional[GradientCheckpointConfig] = None,
+        enable_metadata_cache: bool = True,
+        make_vocab_size_divisible_by: int = 64,
+        moe_dp_outside: bool = True,
+        overlap_p2p: bool = True,
+        overlap_allgather: bool = False,
     ) -> None:
-        world_size = dist.get_world_size()
-        assert tp_size == 1, "Tensor parallel is not supported in MoE yet"
-        assert sp_size == 1 and enable_sequence_parallelism is False, "Sequence parallelism it not supported in MoE yet"
+        if overlap_communication or zero_stage == 2:
+            overlap_communication = False
+            zero_stage = 1
+            warnings.warn(
+                f"overlap_communication and zero_stage are set to False and 1 because "
+                f"ZeRO-2 or comm overlap cause program hang when some experts are not routed. "
+            )
 
         assert (
-            world_size % (tp_size * pp_size) == 0
-        ), f"world size {world_size} is not divisible by tp_size {tp_size} * pp_size {pp_size}"
-        assert (
-            world_size % (tp_size * pp_size * ep_size) == 0
-        ), f"world size {world_size} is not divisible by tp_size {tp_size} * pp_size {pp_size} * ep_size {ep_size}"
+            dist.get_world_size() % (tp_size * pp_size) == 0
+        ), f"World size {dist.get_world_size()} is not divisible by tp_size {tp_size} * pp_size {pp_size}"
+        if enable_sequence_parallelism:
+            self.sequence_parallelism_mode = (
+                sequence_parallelism_mode if sequence_parallelism_mode is not None else "all_to_all"
+            )
+            assert (
+                self.sequence_parallelism_mode in SUPPORT_SP_MODE
+            ), f"Sequence parallelism mode {self.sequence_parallelism_mode} is not in the supported list {SUPPORT_SP_MODE}"
+            if self.sequence_parallelism_mode in ["split_gather", "ring"]:
+                assert (
+                    tp_size > 1
+                ), f"Sequence parallelism mode {self.sequence_parallelism_mode} must be enabled when using tensor parallelism"
+                if sp_size != 1:
+                    warnings.warn(
+                        f"The sp_size will be the same as tp_size in sequence parallelism mode {self.sequence_parallelism_mode}, will ignore the given sequence parallelism size."
+                    )
+                self.sp_size = 1
+                self.dp_size = dist.get_world_size() // (tp_size * pp_size)
+            elif self.sequence_parallelism_mode in ["all_to_all"]:
+                self.sp_size = 1 if sp_size is None else sp_size
+                self.dp_size = dist.get_world_size() // (self.sp_size * pp_size * tp_size)
+        else:
+            self.dp_size = dist.get_world_size() // (tp_size * pp_size)
+            assert (
+                sp_size == 1 or sp_size is None
+            ), f"You should not set sp_size when sequence parallelism is not enabled."
+            self.sp_size = 1
 
-        self.dp_size = world_size // (tp_size * pp_size)
+        assert self.dp_size % ep_size == 0, f"dp_size should be divisible by ep_size, {self.dp_size=} {ep_size=}"
+        self.moe_dp_size = self.dp_size // ep_size
+        self.ep_size = ep_size
         self.tp_size = tp_size
         self.pp_size = pp_size
-        self.ep_size = ep_size
-        self.sp_size = sp_size
         self.precision = precision
         self.zero_stage = zero_stage
         self.cpu_offload = cpu_offload
@@ -220,61 +266,69 @@ def __init__(
         self.enable_flash_attention = enable_flash_attention
         self.enable_jit_fused = enable_jit_fused
         self.enable_sequence_parallelism = enable_sequence_parallelism
-        self.checkpoint_io = checkpoint_io
-
-        logger = get_dist_logger()
-
-        # NOTE: Two process meshes: global dp for non-moe param; dp + ep for moe param
-        # See https://hpc-ai.com/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient
-        # we change pg mesh to (pp, dp, tp) for better moe performance
-        assert (
-            self.ep_size <= self.dp_size
-        ), f"Not enough devices({self.dp_size}) for expert parallelism size({self.ep_size})."
-
-        self.moe_dp_size = self.dp_size // self.ep_size
-        self.use_ep_inside = use_ep_inside
-        if self.use_ep_inside:
-            logger.info(f"MoE Parallel use ep inside dp.", ranks=[0])
-            self.pp_axis, self.dp_axis, self.ep_axis, self.tp_axis = 0, 1, 2, 3
-            self.pg_mesh = ProcessGroupMesh(self.pp_size, self.moe_dp_size, ep_size, tp_size)
+        if moe_dp_outside:
+            self.moe_dp_axis, self.pp_axis, self.ep_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3, 4
+            self.pg_mesh = ProcessGroupMesh(self.moe_dp_size, self.pp_size, self.ep_size, self.tp_size, self.sp_size)
         else:
-            logger.info(f"MoE Parallel use ep outside dp.", ranks=[0])
-            warnings.warn("Using ep outside dp (cross-node) is strongly discouraged due to communication costs.")
-            self.pp_axis, self.dp_axis, self.ep_axis, self.tp_axis = 0, 2, 1, 3
-            self.pg_mesh = ProcessGroupMesh(self.pp_size, ep_size, self.moe_dp_size, tp_size)
-
-        self.moe_dp_group = self.pg_mesh.get_group_along_axis(self.dp_axis)
-        self.ep_group = self.pg_mesh.get_group_along_axis(self.ep_axis)
-        logger.info(f"Non-MoE Parameter Parallel: pp {self.pp_size}, dp {self.dp_size}, tp {tp_size}", ranks=[0])
-        logger.info(
-            f"MoE Parallel: pp {self.pp_size}, ep {ep_size}, moe dp {self.moe_dp_size}, tp {tp_size}", ranks=[0]
-        )
-
-        self.tp_group = self.pg_mesh.get_group_along_axis(
-            self.tp_axis
-        )  # TODO: support custom tp size for mixtral lm head
-        self.global_dp_group = self.pg_mesh.get_group_along_axis((self.dp_axis, self.ep_axis))
-        self.pp_group = self.pg_mesh.get_group_along_axis(self.pp_axis)
-        # TODO: Currently moe only support partially sequence parallel
-        self.sp_group = self.pg_mesh.get_group_along_axis(self.tp_axis)
+            self.pp_axis, self.moe_dp_axis, self.ep_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3, 4
+            self.pg_mesh = ProcessGroupMesh(self.pp_size, self.moe_dp_size, self.ep_size, self.tp_size, self.sp_size)
 
-        self.custom_policy = custom_policy
         self.stage_manager = None
         self.schedule = None
-
+        self.custom_policy = custom_policy
         assert zero_stage in (0, 1, 2)
         if self.pp_size > 1:
+            assert pp_style in ["1f1b", "interleaved"], "Unsupported pipeline parallelism style"
+            assert pp_style == "interleaved" or num_model_chunks == 1, "num_model_chunks must be 1 when using 1f1b"
             assert (
                 num_microbatches is not None or microbatch_size is not None
             ), "num_microbatches or microbatch_size must be specified when using pipeline parallelism"
-            assert self.zero_stage <= 1, "zero stage must be 0 or 1 when using pipeline parallelism"
-            self.stage_manager = PipelineStageManager(self.pg_mesh, self.pp_axis)
-            self.schedule = OneForwardOneBackwardSchedule(
-                self.stage_manager, num_microbatches=num_microbatches, microbatch_size=microbatch_size
+            assert (
+                self.zero_stage <= 1
+            ), "To avoid prohibitive gradient synchronization costs, zero stage must be 0 or 1 when using pipeline parallelism"
+            self.stage_manager = PipelineStageManager(
+                self.pg_mesh,
+                pipeline_axis=self.pp_axis,
+                enable_interleave=pp_style == "interleaved",
+                num_model_chunks=num_model_chunks,
+                num_layers_per_stage=num_layers_per_stage,
             )
 
+            if pp_style == "interleaved":
+                assert num_model_chunks > 1, "number of model chunks must be > 1 when using interleaved"
+                self.schedule = InterleavedSchedule(
+                    stage_manager=self.stage_manager,
+                    num_model_chunks=num_model_chunks,
+                    num_microbatch=num_microbatches,
+                    microbatch_size=microbatch_size,
+                    enable_metadata_cache=enable_metadata_cache,
+                    overlap_p2p=overlap_p2p,
+                )
+            elif pp_style == "1f1b":
+                self.schedule = OneForwardOneBackwardSchedule(
+                    stage_manager=self.stage_manager,
+                    num_microbatches=num_microbatches,
+                    microbatch_size=microbatch_size,
+                    enable_metadata_cache=enable_metadata_cache,
+                )
+            else:
+                raise NotImplementedError()
+
+        self.tp_group = self.pg_mesh.get_group_along_axis(self.tp_axis)
+        self.dp_group = self.pg_mesh.get_group_along_axis([self.moe_dp_axis, self.ep_axis])
+        self.pp_group = self.pg_mesh.get_group_along_axis(self.pp_axis)
+        self.moe_dp_group = self.pg_mesh.get_group_along_axis(self.moe_dp_axis)
+        self.ep_group = self.pg_mesh.get_group_along_axis(self.ep_axis)
+        if self.enable_sequence_parallelism and self.sequence_parallelism_mode in ["split_gather", "ring"]:
+            self.sp_group = self.pg_mesh.get_group_along_axis(self.tp_axis)
+        else:
+            self.sp_group = self.pg_mesh.get_group_along_axis(self.sp_axis)
+
         self.shard_config = ShardConfig(
             tensor_parallel_process_group=self.tp_group,
+            sequence_parallel_process_group=self.sp_group,
+            ep_group=self.ep_group,
+            moe_dp_group=self.moe_dp_group,
             pipeline_stage_manager=self.stage_manager,
             enable_tensor_parallelism=self.tp_size > 1,
             enable_all_optimization=self.enable_all_optimization,
@@ -282,8 +336,11 @@ def __init__(
             enable_flash_attention=self.enable_flash_attention,
             enable_jit_fused=self.enable_jit_fused,
             enable_sequence_parallelism=enable_sequence_parallelism,
+            sequence_parallelism_mode=sequence_parallelism_mode,
             enable_sequence_overlap=enable_sequence_overlap,
-            ep_group=self.ep_group,
+            parallel_output=parallel_output,
+            make_vocab_size_divisible_by=make_vocab_size_divisible_by,
+            gradient_checkpoint_config=gradient_checkpoint_config,
         )
         self.amp_config = dict(
             initial_scale=initial_scale,
@@ -310,77 +367,16 @@ def __init__(
             overlap_communication=overlap_communication,
             cpu_offload=cpu_offload,
             partition_grad=(self.zero_stage == 2),
+            forced_dtype=PRECISION_TORCH_TYPE[precision],
+            overlap_allgather=overlap_allgather,
         )
 
         self.max_norm = max_norm
 
-    def prepare_dataloader(
-        self, dataset, batch_size, shuffle=False, seed=1024, drop_last=False, pin_memory=False, num_workers=0, **kwargs
-    ):
-        r"""
-        Prepare a dataloader for distributed training. The dataloader will be wrapped by
-        `torch.utils.data.DataLoader` and `torch.utils.data.DistributedSampler`.
-
-
-        Args:
-            dataset (`torch.utils.data.Dataset`): The dataset to be loaded.
-            shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
-            seed (int, optional): Random worker seed for sampling, defaults to 1024.
-            add_sampler: Whether to add ``DistributedDataParallelSampler`` to the dataset. Defaults to True.
-            drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
-                is not divisible by the batch size. If False and the size of dataset is not divisible by
-                the batch size, then the last batch will be smaller, defaults to False.
-            pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
-            num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
-            kwargs (dict): optional parameters for ``torch.utils.data.DataLoader``, more details could be found in
-                    `DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
-
-        Returns:
-            :class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
-        """
-        _kwargs = kwargs.copy()
-        sampler = DistributedSampler(
-            dataset,
-            num_replicas=self.dp_size,
-            rank=dist.get_rank(self.global_dp_group),
-            shuffle=shuffle,
-        )
-
-        # Deterministic dataloader
-        def seed_worker(worker_id):
-            worker_seed = seed
-            np.random.seed(worker_seed)
-            torch.manual_seed(worker_seed)
-            random.seed(worker_seed)
-
-        return DataLoader(
-            dataset,
-            batch_size=batch_size,
-            sampler=sampler,
-            worker_init_fn=seed_worker,
-            drop_last=drop_last,
-            pin_memory=pin_memory,
-            num_workers=num_workers,
-            **_kwargs,
-        )
-
     def get_checkpoint_io(self) -> MoECheckpointIO:
-        if self.checkpoint_io is None:
-            self.checkpoint_io = MoECheckpointIO(
-                self.global_dp_group, self.pp_group, self.tp_group, self.ep_group, self.moe_dp_group, self.zero_stage
-            )
-        else:
-            self.checkpoint_io = self.checkpoint_io(
-                self.global_dp_group,
-                self.pp_group,
-                self.tp_group,
-                ep_group=self.ep_group,
-                moe_dp_group=self.moe_dp_group,
-                zero_stage=self.zero_stage,
-            )
-            if hasattr(self.checkpoint_io, "moe_info"):
-                self.checkpoint_io.moe_info = self.moe_info
-        return self.checkpoint_io
+        return MoECheckpointIO(
+            self.dp_group, self.pp_group, self.tp_group, self.ep_group, self.moe_dp_group, self.zero_stage
+        )
 
     def configure(
         self,
@@ -391,13 +387,40 @@ def configure(
         lr_scheduler: Optional[LRScheduler] = None,
     ) -> Tuple[Module, OptimizerWrapper, Callable, DataLoader, LRScheduler]:
         param_info = get_param_info(optimizer)
+
+        # TODO: Support Galore + ZeRO
+        # Replace with distributed implementation if exists
+        optimizer = cast_to_distributed(optimizer)
+
         if not isinstance(model, ModelWrapper):
-            use_ddp = self.dp_size > 1 and self.pp_size == 1 and self.zero_stage == 0
+            use_ddp = (self.dp_size > 1 and self.pp_size == 1 and self.zero_stage == 0) or (
+                self.dp_size == 1
+                and self.pp_size == 1
+                and self.enable_sequence_parallelism
+                and self.sequence_parallelism_mode == "all_to_all"
+            )
+            if use_ddp:
+                warnings.warn(
+                    f"Will have to check all params are used in pytorch DDP since not all experts are always activated"
+                )
+                self.ddp_config["find_unused_parameters"] = True
+
+                if dist.get_process_group_ranks(self.dp_group) != dist.get_process_group_ranks(self.moe_dp_group):
+                    raise ValueError(
+                        f"if pytorch ddp is used, dp_group and moe_dp_group are expected to be the same since DDP can only reduce grad across a single group, but found dp_group {dist.get_process_group_ranks(self.dp_group)} and moe_dp_group {dist.get_process_group_ranks(self.moe_dp_group)}, you might want to use HybridParallelPlugin (i.e. set ep_size = 1) or set zero_stage > 0"
+                    )
+
+            # sync gradients across DP * SP ranks
+            if self.enable_sequence_parallelism and self.sequence_parallelism_mode == "all_to_all":
+                dp_group = self.pg_mesh.create_group_along_axis([self.moe_dp_axis, self.ep_axis, self.sp_axis])
+            else:
+                dp_group = self.dp_group
+
             model = HybridParallelModule(
                 module=model,
                 precision=self.precision,
                 shard_config=self.shard_config,
-                dp_group=self.global_dp_group,
+                dp_group=dp_group,
                 tp_group=self.tp_group,
                 sp_group=self.sp_group,
                 use_ddp=use_ddp,
@@ -405,7 +428,13 @@ def configure(
                 custom_policy=self.custom_policy,
             )
         if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
+            if self.ep_size > 1:
+                # if ep is enabled, the num of (moe) paramaters changed since they are sharded among ep groups
+                # but the optimizer is not aware of ep, so we need to update the optimizer
+                reinitialize_optimizer(optimizer, model)
+
             if self.zero_stage == 0:
+                is_zero = False
                 if self.precision in ["fp16", "bf16"]:
                     optimizer = HybridParallelAMPOptimizer(
                         optimizer,
@@ -418,20 +447,30 @@ def configure(
                     )
                 else:
                     optimizer = HybridParallelNaiveOptimizer(
-                        optimizer, model, use_pipeline=self.enable_pipeline_parallelism, param_info=param_info
+                        optimizer,
+                        model,
+                        use_pipeline=self.enable_pipeline_parallelism,
+                        param_info=param_info,
+                        max_norm=self.max_norm,
+                        pp_process_group=self.pp_group,
+                        tp_process_group=self.tp_group,
                     )
             else:
-                assert self.dp_size > 1, "Please use Zero when data parallel size is greater than 1."
+                if self.dp_size <= 1:
+                    warnings.warn(
+                        "Use Zero Optimizer when data parallel size is 1 may introduce unnecessary overhead. "
+                        "If you do not intend to use cpu_offload, please consider set zero_stage=0."
+                    )
                 assert self.precision != "fp32", "Please set precision to 'fp16' or 'bf16' when using ZeRO."
                 optimizer = MoeHybridParallelZeroOptimizer(
                     optimizer,
                     model,
                     use_pipeline=self.enable_pipeline_parallelism,
                     param_info=param_info,
-                    dp_process_group=self.global_dp_group,
+                    dp_process_group=dp_group,
                     tp_process_group=self.tp_group,
                     pp_process_group=self.pp_group,
-                    moe_extra_dp_process_group=self.moe_dp_group,
+                    moe_dp_group=self.moe_dp_group,
                     verbose=True,
                     clip_grad_norm=self.max_norm,
                     **self.zero_config,
@@ -440,4 +479,11 @@ def configure(
             # inject update_master_params
             model.update_master_params = MethodType(optimizer.update_master_params, model)
 
+            # Setup optimizers that require global states
+            optim = optimizer.optim
+            if isinstance(optim, DistributedOptim):
+                shard_to_param = optimizer.get_master_to_working_map() if is_zero else {}
+                padding_map = optimizer.get_param_padding_map() if is_zero else defaultdict(int)
+                optim.setup_distributed(self.tp_group, self.dp_group, shard_to_param, padding_map, is_zero)
+
         return model, optimizer, criterion, dataloader, lr_scheduler
diff --git a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
index 61c9d1438cdf..0310df5489b0 100644
--- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
+++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
@@ -195,6 +195,7 @@ def save_sharded_model(
         """
 
         assert isinstance(model, ModelWrapper), "Please boost the model before saving!"
+        model._force_wait_all_gather()
         model = model.unwrap()
 
         if os.path.isfile(checkpoint):
@@ -303,6 +304,7 @@ def load_sharded_model(self, model: ModelWrapper, checkpoint_index_file: Path, s
                                      This argument should be manually set to False since params on same device might be stored in different files.
         """
         assert isinstance(model, ModelWrapper), "Please boost the model before loading!"
+        model._force_wait_all_gather()
         model_before_wrapping = model  # backup for model before wrapping
         model = model.unwrap()
 
@@ -639,6 +641,7 @@ def save_unsharded_model(self, model: ModelWrapper, checkpoint: str, gather_dten
             logging.warning("Please avoid using unsharded checkpointing methods when dealing with large models!")
 
         assert isinstance(model, ModelWrapper), "Please boost the model before saving!"
+        model._force_wait_all_gather()
         model = model.unwrap()
 
         if self.dp_rank != 0:
@@ -679,6 +682,7 @@ def load_unsharded_model(self, model: ModelWrapper, checkpoint: str, strict: boo
             logging.warning("Please avoid using unsharded checkpointing methods when dealing with large models!")
 
         assert isinstance(model, ModelWrapper), "Please boost the model before loading!"
+        model._force_wait_all_gather()
         strict = False
         model_before_wrapping = model
         model = model.unwrap()
@@ -943,3 +947,17 @@ def shard_from_complete_optimizer_state(
                 state_[k] = v.detach().clone().to(device)
 
         return state_
+
+    def save_lora_as_pretrained(self, model, checkpoint, use_safetensors):
+        if os.path.isfile(checkpoint):
+            logging.error(f"Provided path ({checkpoint}) should be a directory, not a file")
+            return
+        from peft import PeftModel
+
+        assert isinstance(model, ModelWrapper), "Please boost the model before saving!"
+        model._force_wait_all_gather()
+        peft_model = model.unwrap()
+        assert isinstance(
+            peft_model, PeftModel
+        ), "The model doesn't have lora adapters, please enable lora before saving."
+        return peft_model.save_pretrained(checkpoint, safe_serialization=use_safetensors)
diff --git a/colossalai/checkpoint_io/moe_checkpoint.py b/colossalai/checkpoint_io/moe_checkpoint.py
index a0b62500807f..9181956b7f60 100644
--- a/colossalai/checkpoint_io/moe_checkpoint.py
+++ b/colossalai/checkpoint_io/moe_checkpoint.py
@@ -151,13 +151,10 @@ def save_sharded_model(
 
         # ep_rank 0 saves all the parameters and buffers.
         # other ep_ranks save only experts
-        ep_param_pattern = "experts." if self.ep_rank != 0 else None
 
         # Then collect the sharded parameters & buffers along tp_group.
         # Only devices with tp_rank == 0 are responsible for model saving.
-        state_dict_shard = MoECheckpointIO._model_sharder(
-            model, size_per_shard=size_per_shard, param_name_pattern=ep_param_pattern
-        )
+        state_dict_shard = MoECheckpointIO._model_sharder(model, size_per_shard=size_per_shard)
         weights_name, save_index_file = get_model_base_filenames(prefix, use_safetensors)
         index_file = CheckpointIndexFile(checkpoint)
         control_saving = self.tp_rank == 0
diff --git a/colossalai/cluster/dist_coordinator.py b/colossalai/cluster/dist_coordinator.py
index 98191747e5b3..14a8eabb42b1 100644
--- a/colossalai/cluster/dist_coordinator.py
+++ b/colossalai/cluster/dist_coordinator.py
@@ -44,7 +44,7 @@ def __init__(self):
         self._rank = dist.get_rank()
         self._world_size = dist.get_world_size()
         # this is often passed by launchers such as torchrun
-        self._local_rank = os.environ.get("LOCAL_RANK", -1)
+        self._local_rank = int(os.environ.get("LOCAL_RANK", -1))
 
     @property
     def rank(self) -> int:
diff --git a/colossalai/cluster/process_group_mesh.py b/colossalai/cluster/process_group_mesh.py
index 1319a4529093..dc96708f0270 100644
--- a/colossalai/cluster/process_group_mesh.py
+++ b/colossalai/cluster/process_group_mesh.py
@@ -7,6 +7,7 @@
 import numpy as np
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
+from torch.distributed.distributed_c10d import GroupMember
 
 
 def prod(nums: List[int]) -> int:
@@ -47,7 +48,7 @@ def __init__(self, *size: int) -> None:
         self._shape = size
         self._rank = dist.get_rank()
         self._coord = ProcessGroupMesh.unravel(self._rank, self._shape)
-        self._ranks_to_group: Dict[Tuple[int, ...], ProcessGroup] = {}
+        self._ranks_to_group: Dict[Tuple[int, ...], Union[ProcessGroup, GroupMember.NON_GROUP_MEMBER]] = {}
         self._group_to_ranks: Dict[ProcessGroup, Tuple[int, ...]] = {}
 
     def destroy_mesh_process_groups(self):
@@ -136,7 +137,7 @@ def ravel(coord: Tuple[int, ...], shape: Tuple[int, ...], mode: str = "raise") -
         assert mode in ["raise", "wrap", "clip"]
         return int(np.ravel_multi_index(coord, shape, mode))
 
-    def get_group(self, ranks_in_group: List[int], backend: Optional[str] = None) -> ProcessGroup:
+    def _get_group(self, ranks_in_group: List[int], backend: Optional[str] = None) -> ProcessGroup:
         """Get the process group with the given ranks. It the process group doesn't exist, it will be created.
 
         Args:
@@ -147,10 +148,11 @@ def get_group(self, ranks_in_group: List[int], backend: Optional[str] = None) ->
             ProcessGroup: The process group with the given ranks.
         """
         ranks_in_group = sorted(ranks_in_group)
-        if tuple(ranks_in_group) not in self._group_to_ranks:
+        if tuple(ranks_in_group) not in self._ranks_to_group:
             group = dist.new_group(ranks_in_group, backend=backend)
             self._ranks_to_group[tuple(ranks_in_group)] = group
-            self._group_to_ranks[group] = tuple(ranks_in_group)
+            if group is not GroupMember.NON_GROUP_MEMBER:
+                self._group_to_ranks[group] = tuple(ranks_in_group)
         return self._ranks_to_group[tuple(ranks_in_group)]
 
     def get_ranks_in_group(self, group: ProcessGroup) -> List[int]:
@@ -238,7 +240,7 @@ def create_group_along_axis(
         for base_coord in itertools.product(*[range(s) for s in reduced_shape]):
             coords_in_group = ProcessGroupMesh.get_coords_along_axis(base_coord, axis, indices_at_axis)
             ranks_in_group = tuple([ProcessGroupMesh.ravel(coord, self._shape) for coord in coords_in_group])
-            group = self.get_group(ranks_in_group, backend=backend)
+            group = self._get_group(ranks_in_group, backend=backend)
             if self._rank in ranks_in_group:
                 target_group = group
         return target_group
diff --git a/colossalai/inference/README.md b/colossalai/inference/README.md
index 0a9b5293d4a2..76813a4a3495 100644
--- a/colossalai/inference/README.md
+++ b/colossalai/inference/README.md
@@ -18,7 +18,7 @@
 
 
 ## 📌 Introduction
-ColossalAI-Inference is a module which offers acceleration to the inference execution of Transformers models, especially LLMs. In ColossalAI-Inference, we leverage high-performance kernels, KV cache, paged attention, continous batching and other techniques to accelerate the inference of LLMs. We also provide simple and unified APIs for the sake of user-friendliness. [[blog]](https://hpc-ai.com/blog/colossal-inference)
+ColossalAI-Inference is a module which offers acceleration to the inference execution of Transformers models, especially LLMs and DiT Diffusion Models. In ColossalAI-Inference, we leverage high-performance kernels, KV cache, paged attention, continous batching and other techniques to accelerate the inference of LLMs. We also provide simple and unified APIs for the sake of user-friendliness. [[blog]](https://hpc-ai.com/blog/colossal-inference)
 
 <p align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/inference/colossal-inference-v1-1.png" width=1000/>
@@ -310,4 +310,14 @@ If you wish to cite relevant research papars, you can find the reference below.
   journal={arXiv},
   year={2023}
 }
+
+# Distrifusion
+@InProceedings{Li_2024_CVPR,
+    author={Li, Muyang and Cai, Tianle and Cao, Jiaxin and Zhang, Qinsheng and Cai, Han and Bai, Junjie and Jia, Yangqing and Li, Kai and Han, Song},
+    title={DistriFusion: Distributed Parallel Inference for High-Resolution Diffusion Models},
+    booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+    month={June},
+    year={2024},
+    pages={7183-7193}
+}
 ```
diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py
index e114e8a61ac4..072ddbcfd298 100644
--- a/colossalai/inference/config.py
+++ b/colossalai/inference/config.py
@@ -5,7 +5,7 @@
 import logging
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, fields
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 from transformers.generation import GenerationConfig
@@ -186,6 +186,7 @@ class InferenceConfig(RPC_PARAM):
         enable_streamingllm(bool): Whether to use StreamingLLM, the relevant algorithms refer to the paper at https://arxiv.org/pdf/2309.17453 for implementation.
         start_token_size(int): The size of the start tokens, when using StreamingLLM.
         generated_token_size(int): The size of the generated tokens, When using StreamingLLM.
+        patched_parallelism_size(int): Patched Parallelism Size, When using Distrifusion
     """
 
     # NOTE: arrange configs according to their importance and frequency of usage
@@ -245,6 +246,11 @@ class InferenceConfig(RPC_PARAM):
     start_token_size: int = 4
     generated_token_size: int = 512
 
+    # Acceleration for Diffusion Model(PipeFusion or Distrifusion)
+    patched_parallelism_size: int = 1  # for distrifusion
+    # pipeFusion_m_size: int = 1  # for pipefusion
+    # pipeFusion_n_size: int = 1  # for pipefusion
+
     def __post_init__(self):
         self.max_context_len_to_capture = self.max_input_len + self.max_output_len
         self._verify_config()
@@ -288,6 +294,14 @@ def _verify_config(self) -> None:
         # Thereafter, we swap out tokens in units of blocks, and always swapping out the second block when the generated tokens exceeded the limit.
         self.start_token_size = self.block_size
 
+        # check Distrifusion
+        # TODO(@lry89757) need more detailed check
+        if self.patched_parallelism_size > 1:
+            # self.use_patched_parallelism = True
+            self.tp_size = (
+                self.patched_parallelism_size
+            )  # this is not a real tp, because some annoying check, so we have to set this to patched_parallelism_size
+
         # check prompt template
         if self.prompt_template is None:
             return
@@ -324,6 +338,7 @@ def to_model_shard_inference_config(self) -> "ModelShardInferenceConfig":
             use_cuda_kernel=self.use_cuda_kernel,
             use_spec_dec=self.use_spec_dec,
             use_flash_attn=use_flash_attn,
+            patched_parallelism_size=self.patched_parallelism_size,
         )
         return model_inference_config
 
@@ -396,3 +411,50 @@ class ModelShardInferenceConfig:
     use_cuda_kernel: bool = False
     use_spec_dec: bool = False
     use_flash_attn: bool = False
+    patched_parallelism_size: int = 1  # for diffusion model, Distrifusion Technique
+
+
+@dataclass
+class DiffusionGenerationConfig:
+    """
+    Param for diffusion model forward
+    """
+
+    prompt_2: Optional[Union[str, List[str]]] = None
+    prompt_3: Optional[Union[str, List[str]]] = None
+    height: Optional[int] = None
+    width: Optional[int] = None
+    num_inference_steps: int = None
+    timesteps: List[int] = None
+    guidance_scale: float = None
+    negative_prompt: Optional[Union[str, List[str]]] = (
+        None  # NOTE(@lry89757) in pixart default to "", in sd3 default to None
+    )
+    negative_prompt_2: Optional[Union[str, List[str]]] = None
+    negative_prompt_3: Optional[Union[str, List[str]]] = None
+    num_images_per_prompt: Optional[int] = None
+    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None
+    latents: Optional[torch.FloatTensor] = None
+    prompt_embeds: Optional[torch.FloatTensor] = None
+    negative_prompt_embeds: Optional[torch.FloatTensor] = None
+    pooled_prompt_embeds: Optional[torch.FloatTensor] = None
+    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None
+    output_type: Optional[str] = None  # "pil"
+    return_dict: bool = None
+    joint_attention_kwargs: Optional[Dict[str, Any]] = None
+    clip_skip: Optional[int] = None
+    callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None
+    callback_on_step_end_tensor_inputs: List[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        # NOTE(@lry89757) Only return the dict that not the default value None
+        result = {}
+        for field in fields(self):
+            value = getattr(self, field.name)
+            if value is not None:
+                result[field.name] = value
+        return result
+
+    @classmethod
+    def from_kwargs(cls, **kwargs) -> "DiffusionGenerationConfig":
+        return cls(**kwargs)
diff --git a/colossalai/inference/core/base_engine.py b/colossalai/inference/core/base_engine.py
new file mode 100644
index 000000000000..392dd2990abd
--- /dev/null
+++ b/colossalai/inference/core/base_engine.py
@@ -0,0 +1,90 @@
+from abc import ABC, abstractmethod
+
+import torch
+import torch.nn as nn
+
+from colossalai.cluster import ProcessGroupMesh
+from colossalai.inference.config import ModelShardInferenceConfig
+from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer import ShardConfig, ShardFormer
+from colossalai.shardformer.policies.base_policy import Policy
+
+
+class BaseEngine(ABC):
+    @abstractmethod
+    def __init__(self, model_or_path, inference_config=None, verbose=False, model_policy=None):
+        pass
+
+    @abstractmethod
+    def init_model(self, model_or_path, model_policy=None, model_shard_infer_config=None):
+        """
+        Init Model for Engine
+        """
+
+    @abstractmethod
+    def generate(self, request_ids=None, prompts=None, generation_config=None, **kwargs):
+        """
+        Generate ouptput for coming requests
+        """
+
+    @abstractmethod
+    def add_request(self, prompts, request_ids=None, **kwargs):
+        """
+        Add new request to Engine
+        """
+
+    @abstractmethod
+    def step(self):
+        """
+        Perform one new step forward
+        """
+
+    @abstractmethod
+    def _verify_args(self):
+        """
+        Verify the parameters and members of class
+        """
+
+    @torch.inference_mode()
+    def capture_model(self):
+        """
+        Use cuda graph to capture model
+        """
+        return NotImplementedError("This method should be implemented by subclasses")
+
+    def _shardformer(
+        self,
+        model: nn.Module,
+        model_policy: Policy,
+        model_shard_infer_config: ModelShardInferenceConfig = None,
+        stage_manager: PipelineStageManager = None,
+        tp_group: ProcessGroupMesh = None,
+        **kwargs,
+    ) -> nn.Module:
+        """
+        Initialize ShardConfig and replace the model with shardformer.
+
+        Args:
+            model (nn.Module): Path or nn.Module of this model.
+            model_policy (Policy): The policy to shardformer model which is determined by the model type.
+            stage_manager (PipelineStageManager, optional): Used to manage pipeline stages. Defaults to None.
+            tp_group (ProcessGroupMesh, optional): Used to manage the process TP group mesh. Defaults to None.
+
+        Returns:
+            nn.Module: The model optimized by Shardformer.
+        """
+
+        shardconfig = ShardConfig(
+            tensor_parallel_process_group=tp_group,
+            pipeline_stage_manager=stage_manager,
+            enable_tensor_parallelism=(self.inference_config.tp_size > 1),
+            enable_fused_normalization=False,
+            enable_all_optimization=False,
+            enable_flash_attention=False,
+            enable_jit_fused=False,
+            enable_sequence_parallelism=False,
+            extra_kwargs={"model_shard_infer_config": model_shard_infer_config, **kwargs},
+        )
+        shardformer = ShardFormer(shard_config=shardconfig)
+        shard_model, _ = shardformer.optimize(model, model_policy)
+        return shard_model
diff --git a/colossalai/inference/core/diffusion_engine.py b/colossalai/inference/core/diffusion_engine.py
new file mode 100644
index 000000000000..8bed508cba55
--- /dev/null
+++ b/colossalai/inference/core/diffusion_engine.py
@@ -0,0 +1,200 @@
+from itertools import count
+from typing import List, Tuple, Type, Union
+
+import numpy as np
+import PIL.Image
+import torch
+import torch.nn as nn
+from diffusers.pipelines.pipeline_utils import DiffusionPipeline
+from torch import distributed as dist
+
+from colossalai.accelerator import get_accelerator
+from colossalai.cluster import ProcessGroupMesh
+from colossalai.inference.config import DiffusionGenerationConfig, InferenceConfig, ModelShardInferenceConfig
+from colossalai.inference.modeling.layers.diffusion import DiffusionPipe
+from colossalai.inference.modeling.policy import model_policy_map
+from colossalai.inference.struct import DiffusionSequence
+from colossalai.inference.utils import get_model_size, get_model_type
+from colossalai.logging import get_dist_logger
+from colossalai.shardformer.policies.base_policy import Policy
+
+from .base_engine import BaseEngine
+from .request_handler import NaiveRequestHandler
+
+PP_AXIS, TP_AXIS = 0, 1
+
+
+class DiffusionEngine(BaseEngine):
+    def __init__(
+        self,
+        model_or_path: DiffusionPipeline | str,
+        inference_config: InferenceConfig = None,
+        verbose: bool = False,
+        model_policy: Policy | type[Policy] = None,
+    ) -> None:
+        self.inference_config = inference_config
+        self.dtype = inference_config.dtype
+        self.high_precision = inference_config.high_precision
+
+        self.verbose = verbose
+        self.logger = get_dist_logger(__name__)
+        self.model_shard_infer_config = inference_config.to_model_shard_inference_config()
+
+        self.model_type = get_model_type(model_or_path=model_or_path)
+
+        self.init_model(model_or_path, model_policy, self.model_shard_infer_config)
+
+        self.request_handler = NaiveRequestHandler()
+
+        self.counter = count()
+
+        self._verify_args()
+
+    def _verify_args(self) -> None:
+        assert isinstance(self.model, DiffusionPipe), "model must be DiffusionPipe"
+
+    def init_model(
+        self,
+        model_or_path: Union[str, nn.Module, DiffusionPipeline],
+        model_policy: Union[Policy, Type[Policy]] = None,
+        model_shard_infer_config: ModelShardInferenceConfig = None,
+    ):
+        """
+        Shard model or/and Load weight
+
+        Args:
+            model_or_path Union[nn.Module, str]: path to the checkpoint or model of transformer format.
+            model_policy (Policy): the policy to replace the model.
+            model_inference_config: the configuration for modeling initialization when inference.
+            model_shard_infer_config (ModelShardInferenceConfig): the configuration for init of module when inference.
+        """
+        if isinstance(model_or_path, str):
+            model = DiffusionPipeline.from_pretrained(model_or_path, torch_dtype=self.dtype)
+            policy_map_key = model.__class__.__name__
+            model = DiffusionPipe(model)
+        elif isinstance(model_or_path, DiffusionPipeline):
+            policy_map_key = model_or_path.__class__.__name__
+            model = DiffusionPipe(model_or_path)
+        else:
+            self.logger.error(f"model_or_path support only str or DiffusionPipeline currently!")
+
+        torch.cuda.empty_cache()
+        init_gpu_memory = torch.cuda.mem_get_info()[0]
+
+        self.device = get_accelerator().get_current_device()
+        if self.verbose:
+            self.logger.info(f"the device is {self.device}")
+
+        if self.verbose:
+            self.logger.info(
+                f"Before the shard, Rank: [{dist.get_rank()}], model size: {get_model_size(model)} GB, model's device is: {model.device}"
+            )
+
+        if model_policy is None:
+            model_policy = model_policy_map.get(policy_map_key)
+
+        if not isinstance(model_policy, Policy):
+            try:
+                model_policy = model_policy()
+            except Exception as e:
+                raise ValueError(f"Unable to instantiate model policy: {e}")
+
+        assert isinstance(model_policy, Policy), f"Invalid type of model policy: {type(model_policy)}"
+        pg_mesh = ProcessGroupMesh(self.inference_config.pp_size, self.inference_config.tp_size)
+        tp_group = pg_mesh.get_group_along_axis(TP_AXIS)
+
+        self.model = self._shardformer(
+            model,
+            model_policy,
+            model_shard_infer_config,
+            None,
+            tp_group=tp_group,
+        )
+
+        self.model = model.to(self.device)
+
+        if self.verbose:
+            self.logger.info(
+                f"After the shard, Rank: [{dist.get_rank()}], model size: {get_model_size(self.model)} GB, model's device is: {model.device}"
+            )
+
+        free_gpu_memory, _ = torch.cuda.mem_get_info()
+        peak_memory = init_gpu_memory - free_gpu_memory
+        if self.verbose:
+            self.logger.info(
+                f"Rank [{dist.get_rank()}], Model Weight Max Occupy {peak_memory / (1024 ** 3)} GB, Model size: {get_model_size(self.model)} GB"
+            )
+
+    def generate(
+        self,
+        request_ids: Union[List[int], int] = None,
+        prompts: Union[List[str], str] = None,
+        generation_config: DiffusionGenerationConfig = None,
+        **kwargs,
+    ) -> Union[List[Union[str, List[PIL.Image.Image], np.ndarray]], Tuple[List[str], List[List[int]]]]:
+        """ """
+        gen_config_dict = generation_config.to_dict() if generation_config is not None else {}
+        prompts = [prompts] if isinstance(prompts, str) else prompts
+        request_ids = [request_ids] if isinstance(request_ids, int) else request_ids
+
+        with torch.inference_mode():
+            if prompts is not None:
+                self.add_request(
+                    request_ids=request_ids,
+                    prompts=prompts,
+                    **gen_config_dict,
+                    **kwargs,
+                )
+
+            output_reqs_list = []
+
+            # intuition: If user provide a generation config, we should replace the existing one.
+            if generation_config is not None:
+                self.generation_config = generation_config
+                self.generation_config_dict = gen_config_dict
+
+            while self.request_handler.check_unfinished_reqs():
+                output_reqs_list += self.step()
+
+            return output_reqs_list
+
+    def add_request(
+        self,
+        prompts: Union[List[str], str],
+        request_ids: Union[List[int], int] = None,
+        **kwargs,
+    ):
+        if request_ids is not None and not isinstance(request_ids, list):
+            request_ids = [request_ids]
+
+        if not isinstance(prompts, list):
+            prompts = [prompts]
+
+        generation_config = DiffusionGenerationConfig.from_kwargs(**kwargs)
+        prompts_num = len(prompts)
+        for i in range(prompts_num):
+            if request_ids:
+                assert isinstance(
+                    request_ids[0], int
+                ), f"The request_id type must be int, but got {type(request_ids[0])}"
+                assert len(request_ids) == prompts_num
+                request_id = request_ids[i]
+            else:
+                request_id = next(self.counter)
+
+            seq = DiffusionSequence(request_id=request_id, prompt=prompts[i], generation_config=generation_config)
+
+            self.request_handler.add_sequence(seq)
+
+    def step(self) -> List[PIL.Image.Image]:
+        """
+        In each step, do the follows:
+            1. Run RequestHandler.schedule() and get the batch used for inference.
+            2. run forward to get List[Image]
+        Returns:
+            List[PIL.Image.Image]: Image Generated by one step.
+        """
+
+        input = self.request_handler.schedule()
+        ret = self.model(prompt=input.prompt, **input.generation_config.to_dict())
+        return ret
diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py
index 8f8aef65e59c..5c9bdc3214e9 100644
--- a/colossalai/inference/core/engine.py
+++ b/colossalai/inference/core/engine.py
@@ -1,57 +1,24 @@
-import time
-from itertools import count
-from typing import Dict, List, Optional, Tuple, Type, Union
+from typing import List, Tuple, Type, Union
 
 import numpy as np
-import torch
+import PIL.Image
 import torch.nn as nn
-from torch import distributed as dist
-from transformers import (
-    AutoConfig,
-    AutoModelForCausalLM,
-    GenerationConfig,
-    PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-)
-from transformers.models.llama.modeling_llama import LlamaForCausalLM
+from diffusers import DiffusionPipeline
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
-from colossalai.accelerator import get_accelerator
-from colossalai.cluster import ProcessGroupMesh
-from colossalai.inference.batch_bucket import BatchBucket
-from colossalai.inference.config import InferenceConfig, InputMetaData, ModelShardInferenceConfig
-from colossalai.inference.graph_runner import CUDAGraphRunner
-from colossalai.inference.modeling.policy import model_policy_map
-from colossalai.inference.sampler import search_tokens
-from colossalai.inference.spec import Drafter, GlideInput
-from colossalai.inference.struct import Sequence
-from colossalai.inference.utils import get_model_size, has_index_file
-from colossalai.interface import ModelWrapper
-from colossalai.lazy import LazyInitContext
-from colossalai.logging import get_dist_logger
-from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer import ShardConfig, ShardFormer
+from colossalai.inference.config import InferenceConfig
+from colossalai.inference.utils import ModelType, get_model_type
 from colossalai.shardformer.policies.base_policy import Policy
 
-from .request_handler import RequestHandler
-
 __all__ = ["InferenceEngine"]
 
-PP_AXIS, TP_AXIS = 0, 1
-
-_supported_models = {
-    "LlamaForCausalLM": LlamaForCausalLM,
-    "BaichuanForCausalLM": AutoModelForCausalLM,
-}
-
-_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
-
 
 class InferenceEngine:
     """
     InferenceEngine which manages the inference process..
 
     Args:
-        model_or_path (nn.Module or str): Path or nn.Module of this model.
+        model_or_path (nn.Module or DiffusionPipeline or str): Path or nn.Module or DiffusionPipeline of this model.
         tokenizer Optional[(Union[PreTrainedTokenizer, PreTrainedTokenizerFast])]: Path of the tokenizer to use.
         inference_config (Optional[InferenceConfig], optional): Store the configuration information related to inference.
         verbose (bool): Determine whether or not to log the generation process.
@@ -60,567 +27,68 @@ class InferenceEngine:
 
     def __init__(
         self,
-        model_or_path: Union[nn.Module, str],
-        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
-        inference_config: InferenceConfig,
+        model_or_path: Union[nn.Module, str, DiffusionPipeline],
+        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] = None,
+        inference_config: InferenceConfig = None,
         verbose: bool = False,
         model_policy: Union[Policy, Type[Policy]] = None,
     ) -> None:
-        self.inference_config = inference_config
-        self.dtype = inference_config.dtype
-        self.high_precision = inference_config.high_precision
-
-        self.verbose = verbose
-        self.logger = get_dist_logger(__name__)
-        self.model_shard_infer_config = inference_config.to_model_shard_inference_config()
-
-        self.init_model(model_or_path, model_policy, self.model_shard_infer_config)
-
-        self.generation_config = inference_config.to_generation_config(self.model_config)
-        self.generation_config_dict = self.generation_config.to_dict()
-
-        self.tokenizer = tokenizer
-        self.tokenizer.pad_token = self.tokenizer.eos_token
-
-        self.request_handler = RequestHandler(self.inference_config, self.model_config)
-        self.k_cache, self.v_cache = self.request_handler.get_kvcache()
-        # DISCUSS maybe move this into batch info?
-
-        self.counter = count()
-
-        self.use_cuda_graph = self.inference_config.use_cuda_graph
-        if self.use_cuda_graph:
-            self.graph_runners: Dict[int, CUDAGraphRunner] = {}
-            self.graph_memory_pool = None  # Set during graph capture.
-            if verbose:
-                self.logger.info("Colossal AI CUDA Graph Capture on")
-
-            self.capture_model(self.k_cache, self.v_cache)
-
-        # Model and relatable attrs of speculative decoding will be set by `enable_spec_dec`
-        self.use_spec_dec = self.inference_config.use_spec_dec
-
-        self.drafter_model = None
-        self.drafter = None
-        self.use_glide = False
-        self.n_spec_tokens = self.inference_config.max_n_spec_tokens
-
-        self._verify_args()
-
-    def init_model(
-        self,
-        model_or_path: Union[nn.Module, str],
-        model_policy: Union[Policy, Type[Policy]] = None,
-        model_shard_infer_config: ModelShardInferenceConfig = None,
-    ):
-        """
-        Shard model or/and Load weight
-
-        Args:
-            model_or_path Union[nn.Module, str]: path to the checkpoint or model of transformer format.
-            model_policy (Policy): the policy to replace the model.
-            model_inference_config: the configuration for modeling initialization when inference.
-            model_shard_infer_config (ModelShardInferenceConfig): the configuration for init of module when inference.
-        """
-        pretrained_path = None
-        if isinstance(model_or_path, str):
-            import colossalai.interface.pretrained as pretrained_utils
-
-            try:
-                hf_config = AutoConfig.from_pretrained(model_or_path, trust_remote_code=True, torch_dtype=self.dtype)
-                arch = getattr(hf_config, "architectures")[0]
-                if arch in _supported_models.keys():
-                    if arch is "BaichuanForCausalLM":
-                        self.logger.warning(
-                            "Attention ! We use lazy init by default, which could be faster for model loading. For baichuan model, the output maybe have a slight difference with transformers"
-                        )
-                    ctx = LazyInitContext(default_device="cuda")
-                    with ctx:
-                        model = _supported_models[arch].from_pretrained(
-                            model_or_path, trust_remote_code=True, torch_dtype=self.dtype
-                        )
-                    pretrained_path = pretrained_utils.get_pretrained_path(model)
-                else:
-                    # TODO(char-1ee): if the model not supported, use transformers APIs to load and generate
-                    raise ValueError(f"Model {arch} is not supported.")
-
-            except Exception as e:
-                self.logger.error(
-                    f"An exception occurred during loading model: {e}, model should be loaded by transformers\n"
-                )
-        else:
-            model = model_or_path
-
-        self.model_config = model.config
-
-        torch.cuda.empty_cache()
-        init_gpu_memory = torch.cuda.mem_get_info()[0]
-
-        self.device = get_accelerator().get_current_device()
-        if self.verbose:
-            self.logger.info(f"the device is {self.device}")
-
-        model = model.to(self.dtype).eval()
-
-        if self.verbose:
-            self.logger.info(
-                f"Before the shard, Rank: [{dist.get_rank()}], model size: {get_model_size(model)} GB, model's device is: {model.device}"
+        self.__dict__["_initialized"] = False  # use __dict__ directly to avoid calling __setattr__
+        self.model_type = get_model_type(model_or_path=model_or_path)
+        self.engine = None
+        if self.model_type == ModelType.LLM:
+            from .llm_engine import LLMEngine
+
+            self.engine = LLMEngine(
+                model_or_path=model_or_path,
+                tokenizer=tokenizer,
+                inference_config=inference_config,
+                verbose=verbose,
+                model_policy=model_policy,
             )
-
-        if model_policy is None:
-            prefix = "nopadding" if not self.inference_config.pad_input else "padding"
-            model_policy_key = f"{prefix}_{getattr(self.model_config, 'model_type', None)}"
-            model_policy = model_policy_map.get(model_policy_key)
-
-        if not isinstance(model_policy, Policy):
-            try:
-                model_policy = model_policy()
-            except Exception as e:
-                raise ValueError(f"Unable to instantiate model policy: {e}")
-
-        assert isinstance(model_policy, Policy), f"Invalid type of model policy: {type(model_policy)}"
-        pg_mesh = ProcessGroupMesh(self.inference_config.pp_size, self.inference_config.tp_size)
-        tp_group = pg_mesh.get_group_along_axis(TP_AXIS)
-
-        self.model = self._shardformer(
-            model,
-            model_policy,
-            model_shard_infer_config,
-            None,
-            tp_group=tp_group,
-        )
-
-        self.model = ModelWrapper(model).to(self.device)
-
-        if self.verbose:
-            self.logger.info(
-                f"After the shard, Rank: [{dist.get_rank()}], model size: {get_model_size(self.model)} GB, model's device is: {model.device}"
+        elif self.model_type == ModelType.DIFFUSION_MODEL:
+            from .diffusion_engine import DiffusionEngine
+
+            self.engine = DiffusionEngine(
+                model_or_path=model_or_path,
+                inference_config=inference_config,
+                verbose=verbose,
+                model_policy=model_policy,
             )
+        elif self.model_type == ModelType.UNKNOWN:
+            self.logger.error(f"Model Type either Difffusion or LLM!")
 
-        if pretrained_path:
-            from colossalai.inference.core.plugin import InferCheckpoint_io
-
-            cpt_io = InferCheckpoint_io()
-            if_has_index_file, model_index_file = has_index_file(pretrained_path)
-            assert if_has_index_file, "the model path is invalid"
-            cpt_io.load_model(self.model, model_index_file)
-
-        free_gpu_memory, _ = torch.cuda.mem_get_info()
-        peak_memory = init_gpu_memory - free_gpu_memory
-        if self.verbose:
-            self.logger.info(
-                f"Rank [{dist.get_rank()}], Model Weight Max Occupy {peak_memory / (1024 ** 3)} GB, Model size: {get_model_size(self.model)} GB"
-            )
-
-    @torch.inference_mode()
-    def capture_model(self, k_cache: List[torch.Tensor], v_cache: List[torch.Tensor]):
-        assert self.use_cuda_graph, "please turn on the cuda graph"
-
-        if self.verbose:
-            self.logger.info("Colossal AI CUDA Graph Capture begin")
-
-        t_capture_begin = time.perf_counter()
-
-        block_size = self.inference_config.block_size
-        head_dim = self.model_config.hidden_size // self.model_config.num_attention_heads
-
-        # Prepare dummy inputs. These will be reused for all batch sizes.
-        max_batch_size = max(_BATCH_SIZES_TO_CAPTURE)
-        max_context_len_to_capture = self.inference_config.max_context_len_to_capture
-        max_num_blocks = (max_context_len_to_capture + block_size - 1) // block_size
-        input_tokens_ids = torch.zeros(max_batch_size, dtype=torch.long).cuda()
-        # self.graph_block_tables = np.zeros((max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32)
-        self.graph_block_tables = np.full((max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), -1, dtype=np.int32)
-        self.graph_block_tables[:, 0] = np.arange(max_num_blocks, max_num_blocks + max(_BATCH_SIZES_TO_CAPTURE))
-        self.graph_block_tables[0, :] = np.arange(
-            0, max_num_blocks
-        )  # NOTE this is a hack to insure cuda grpah could capture the fixed cuda kernel grid in flash decoding, to make the first seqlen as the max_seq_len
-        block_tables = torch.from_numpy(self.graph_block_tables).cuda()
-        output_tensor = torch.zeros(
-            (max_batch_size, self.model_config.num_attention_heads * head_dim), dtype=self.dtype, device=self.device
-        )
-        fd_inter_tensor = self.request_handler.running_bb.fd_inter_tensor
-
-        max_num_seqs = self.inference_config.max_batch_size
-        batch_size_capture_list = [bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= max_num_seqs]
-        sequence_lengths = torch.ones(max_batch_size, dtype=torch.int).cuda()
-        # NOTE this is a hack to insure cuda grpah could capture the fixed cuda kernel grid in flash decoding, to make the first seqlen as the max_seq_len
-        sequence_lengths[0] = torch.tensor(
-            self.inference_config.max_context_len_to_capture - 1, dtype=torch.int32
-        ).cuda()
-
-        # NOTE: Capturing the largest batch size first may help reduce the
-        # memory usage of CUDA graph.
-        for batch_size in reversed(batch_size_capture_list):
-            if self.verbose:
-                self.logger.info(f"batch size {batch_size} graph capturing")
-
-            input_meta_data = InputMetaData(
-                block_tables=block_tables[:batch_size],
-                sequence_lengths=sequence_lengths[:batch_size],
-                fd_inter_tensor=fd_inter_tensor,
-                batch_size=batch_size,
-                is_prompts=False,
-                use_cuda_graph=True,
-                high_precision=False,
-                kv_seq_len=sequence_lengths[:batch_size].max().item(),
-                head_dim=head_dim,
-                dtype=self.dtype,
-            )
-
-            graph_runner = CUDAGraphRunner(self.model)
-            graph_runner.capture(
-                input_tokens_ids[:batch_size],
-                output_tensor[:batch_size],
-                input_meta_data,
-                k_caches=k_cache,
-                v_caches=v_cache,
-                memory_pool=self.graph_memory_pool,
-            )
-            self.graph_memory_pool = graph_runner.graph.pool()
-            self.graph_runners[batch_size] = graph_runner
-
-        t_capture_end = time.perf_counter()
-
-        if self.verbose:
-            self.logger.info(f"CUDA Graph capture time: {t_capture_end - t_capture_begin} s")
+        self._initialized = True
+        self._verify_args()
 
     def _verify_args(self) -> None:
         """Verify the input args"""
-        if not isinstance(self.inference_config, InferenceConfig):
-            raise TypeError("Invalid type of inference config provided.")
-        if not isinstance(self.model, nn.Module):
-            raise TypeError(f"the model type must be nn.Module, but got {type(self.model)}")
-        if not isinstance(self.tokenizer, (PreTrainedTokenizerFast, PreTrainedTokenizer)):
-            raise TypeError(
-                f"the tokenizer type must be PreTrainedTokenizer or PreTrainedTokenizerFast, but got {type(self.tokenizer)}"
-            )
-        if isinstance(self.model, ModelWrapper):
-            model = self.model.module
-        assert (
-            model.__class__.__name__ in _supported_models.keys()
-        ), f"Model {self.model.__class__.__name__} is not supported."
-
-    def _shardformer(
-        self,
-        model: nn.Module,
-        model_policy: Policy,
-        model_shard_infer_config: ModelShardInferenceConfig = None,
-        stage_manager: PipelineStageManager = None,
-        tp_group: ProcessGroupMesh = None,
-    ) -> nn.Module:
-        """
-        Initialize ShardConfig and replace the model with shardformer.
-
-        Args:
-            model (nn.Module): Path or nn.Module of this model.
-            model_policy (Policy): The policy to shardformer model which is determined by the model type.
-            stage_manager (PipelineStageManager, optional): Used to manage pipeline stages. Defaults to None.
-            tp_group (ProcessGroupMesh, optional): Used to manage the process TP group mesh. Defaults to None.
-
-        Returns:
-            nn.Module: The model optimized by Shardformer.
-        """
-
-        shardconfig = ShardConfig(
-            tensor_parallel_process_group=tp_group,
-            pipeline_stage_manager=stage_manager,
-            enable_tensor_parallelism=(self.inference_config.tp_size > 1),
-            enable_fused_normalization=False,
-            enable_all_optimization=False,
-            enable_flash_attention=False,
-            enable_jit_fused=False,
-            enable_sequence_parallelism=False,
-            extra_kwargs={"model_shard_infer_config": model_shard_infer_config},
-        )
-        shardformer = ShardFormer(shard_config=shardconfig)
-        shard_model, _ = shardformer.optimize(model, model_policy)
-        return shard_model
-
-    def enable_spec_dec(
-        self,
-        drafter_model: nn.Module = None,
-        n_spec_tokens: int = None,
-        use_glide_drafter: bool = False,
-    ) -> None:
-        """Initialize drafter (if it has not yet), and enable Speculative Decoding for subsequent generations.
-
-        Args:
-            drafter_model (nn.Module): The drafter model (small model) used to speculate tokens.
-                If provided, the previous drafter and drafter model, if exist, will be overwritten.
-            n_spec_tokens (Optional[int]): The number of tokens to speculate in each round of speculating-verifying.
-                If not provided, `max_n_spec_tokens` in InferenceConfig will be used.
-            use_glide_drafter (bool): Whether to use glide model for speculative decoding. Defaults to False.
-                If True, the drafter model will be replaced by a glide model.
-
-        ```python
-        ...
-        engine = InferenceEngine(model, tokenizer, inference_config)
-
-        engine.enable_spec_dec(drafter_model, n_spec_tokens=5)
-        engine.generate(...)  # Speculative Decoding
-
-        engine.disable_spec_dec()
-        engine.generate(...)  # Normal generation
-
-        engine.enable_spec_dec()
-        engine.generate(...)  # Speculative-Decoding using previously set drafter model and number of spec tokens
-        engine.clear_spec_dec()
-        ```
-        """
-
-        if drafter_model is None and self.drafter is None:
-            raise ValueError("Drafter not initialized. Please provide a Drafter Model")
-        if n_spec_tokens is not None:
-            assert 1 < n_spec_tokens <= self.inference_config.max_n_spec_tokens
-            self.n_spec_tokens = n_spec_tokens
-        if drafter_model is not None:
-            assert isinstance(drafter_model, nn.Module)
-            # overwrite the drafter, if exists
-            self.clear_spec_dec()
-            self.drafter_model = drafter_model
-            self.drafter = Drafter(
-                self.drafter_model,
-                self.tokenizer,
-                device=self.device,
-                dtype=self.dtype,
-            )
-
-            # check if the provided drafter model is compatible with GLIDE structure
-            # when `use_glide_drafter` is set to True
-            if (
-                use_glide_drafter
-                and hasattr(drafter_model, "model")
-                and hasattr(drafter_model.model, "layers")
-                and hasattr(drafter_model.model.layers[0], "cross_attn")
-            ):
-                self.use_glide = use_glide_drafter
-            elif use_glide_drafter:
-                self.logger.warning(
-                    f"`use_glide_drafter` is provided as {use_glide_drafter}, "
-                    f"but the provided drafter model is not compatible with GLIDE structure."
-                    f"Falling back to use the default drafter model (non-GLIDE)."
-                )
-        self.request_handler.set_spec_dec_mode(self.n_spec_tokens)
-        # using speculative decoding for subsequent generations
-        self.use_spec_dec = True
-
-    def disable_spec_dec(self) -> None:
-        """Disable using speculative decoding for subsequent generations."""
-        self.request_handler.unset_spec_dec_mode()
-        # set back to the maximum number of tokens to speculate
-        self.n_spec_tokens = self.inference_config.max_n_spec_tokens
-        self.use_glide = False
-        self.use_spec_dec = False
-
-    def clear_spec_dec(self) -> None:
-        """Clear relatable structures of speculative decoding, if exist."""
-        if self.use_spec_dec:
-            self.disable_spec_dec()
-        if self.drafter_model or self.drafter:
-            self.drafter_model = None
-            self.drafter = None
-            torch.cuda.empty_cache()
-        self.use_glide = False
-        self.use_spec_dec = False
-
-    def steps_spec_dec(self) -> List[Sequence]:
-        """
-        Run Speculative Decoding steps. This is like retrieving a single batch and launch inference
-        with many steps of speculating by a drafter model as well as verifying by a main model.
-
-        Returns:
-            List[Sequence]: finished sequences generated by one step.
-        """
-        batch = self.request_handler.schedule()  # prefill batch
-        assert batch.current_batch_size == 1, "Only support bsz 1 for speculative decoding for now."
-
-        input_token_ids, output_tensor, input_meta_data = self.prepare_input(batch)
-
-        if input_meta_data.use_cuda_graph:
-            model_executable = self.graph_runners[input_meta_data.batch_size]
-        else:
-            model_executable = self.model
-
-        # 1. Prefill small model (Drafter) - fill past kv cache for drafter model
-        # NOTE For glide drafter models, we won't actually apply glide during prefill stage
-        drafter_out = self.drafter.speculate(input_token_ids, 1, None)
-        next_token_ids_spec = drafter_out.next_tokens
-        drafter_past_key_values = drafter_out.past_key_values
-
-        # 2. Prefill main model (Verifier) - fill past kv cache for main model
-        logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
-        next_tokens = search_tokens(self.generation_config, logits, batch_token_ids=batch.batch_token_ids)
-        # append new inputs to the batch, temporarily
-        batch.append_batch_tokens(next_tokens)
-        self.request_handler.allocate_batch_spec_dec(batch, 1)
-        already_allocated_kv_len = batch.seq_lengths[0].item()
-        input_token_ids = batch.get_1D_inputs_spec_dec(1)
-
-        finished_sequences = self.request_handler.update()
-
-        while True:
-            # HACK Retrieve the running batch
-            #      Using RequestHandler.schedule here will re-allocate same kv cache for the batch
-            batch = self.request_handler.running_bb  # running batch
-            assert batch.current_batch_size == 1, "Only support bsz 1 for speculative decoding for now."
-
-            # 3. Decoding - Drafter model speculates `n` tokens
-            glide_input = None
-            if self.use_glide:
-                glide_input = GlideInput(
-                    batch.get_block_table_tensor(),
-                    self.k_cache[-1],  # use kv cahces of the last layer
-                    self.v_cache[-1],
-                    batch.get_sequence_lengths(),
-                    n_spec_tokens=self.n_spec_tokens,
-                )
-
-            drafter_out = self.drafter.speculate(
-                input_token_ids,
-                self.n_spec_tokens,
-                drafter_past_key_values,
-                glide_input=glide_input,
-            )
-            next_token_ids_spec = drafter_out.next_tokens
-            drafter_past_key_values = drafter_out.past_key_values
-            drafter_spec_length = drafter_out.speculated_length
-
-            for next_token_id_spec in next_token_ids_spec:
-                self.request_handler.append_next_tokens(next_token_id_spec.unsqueeze(0))
-            cur_length = batch.seq_lengths[0].item()
-            if already_allocated_kv_len < cur_length:
-                self.request_handler.allocate_batch_spec_dec(batch, n=cur_length - already_allocated_kv_len)
-                already_allocated_kv_len = cur_length
-
-            # 4. Decoding - Main model verifies `n` tokens in parallel
-            if drafter_spec_length < batch.num_tokens_to_verify:
-                batch.set_use_spec_dec(num_tokens_to_verify=drafter_spec_length)
-            input_token_ids, output_tensor, input_meta_data = self.prepare_input(batch)
-            logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
-
-            next_tokens = search_tokens(self.generation_config, logits, batch_token_ids=batch.batch_token_ids)
-
-            # 5. Compare and process the results
-            diff_indexes = torch.nonzero(~(next_tokens[:-1] == next_token_ids_spec))
-            n_matches = drafter_spec_length if diff_indexes.size(0) == 0 else diff_indexes[0][0].item()
-
-            # revoke appended tokens for each Sequence in the current batch
-            batch.revoke_batch_tokens(drafter_spec_length - n_matches)  # revoke drafted tokens
-
-            # append the last correct token generated by the main model
-            self.request_handler.append_next_tokens(next_tokens[n_matches].unsqueeze(0))
-
-            # trim past key values of the drafter model
-            drafter_past_key_values = Drafter.trim_kv_cache(
-                drafter_past_key_values, drafter_spec_length - n_matches - 1
-            )
-
-            # prepare inputs for the next round of speculation
-            n = 1 if n_matches < drafter_spec_length else 2
-            input_token_ids = batch.get_1D_inputs_spec_dec(n)
-
-            self.request_handler.update_batch_finished(batch, generation_config=self.generation_config)
-            finished_sequences = self.request_handler.update()
-            if len(finished_sequences) > 0:
-                break
-
-        # Reset back the number of speculated tokens of the batch,
-        # this is used to handle the last round of speculation, in which case the number of speculated tokens
-        # by the drafter is less than the number of speculated tokens set to the engine.
-        batch.set_use_spec_dec(num_tokens_to_verify=self.n_spec_tokens)
-
-        return finished_sequences
+        assert self.engine is not None, "Please init Engine first"
+        assert self._initialized, "Engine must be initialized"
 
     def generate(
         self,
         request_ids: Union[List[int], int] = None,
         prompts: Union[List[str], str] = None,
-        prompts_token_ids: Union[List[int], torch.Tensor, np.ndarray] = None,
-        return_token_ids: bool = False,
-        generation_config: Optional[GenerationConfig] = None,
-    ) -> Union[List[str], Tuple[List[str], List[List[int]]]]:
+        *args,
+        **kwargs,
+    ) -> Union[List[Union[str, List[PIL.Image.Image], np.ndarray]], Tuple[List[str], List[List[int]]]]:
         """
         Executing the inference step.
 
         Args:
             request_ids (List[int], optional): The request ID. Defaults to None.
             prompts (Union[List[str], optional): Input prompts. Defaults to None.
-            prompts_token_ids (Union[List[int], torch.Tensor, np.ndarray], optional): token ids of input prompts. Defaults to None.
-            return_token_ids (bool, optional): Whether to return output token ids. Defaults to False.
-            generation_config (Optional[GenerationConfig], optional): Huggingface GenerationConfig used for inference. Defaults to None.
-
-        Returns:
-            Union[List[str], Tuple[List[str], List[List[int]]]]: Inference result returned by one generation.
-        """
-
-        gen_config_dict = generation_config.to_dict() if generation_config is not None else {}
-        prompts = [prompts] if isinstance(prompts, str) else prompts
-        request_ids = [request_ids] if isinstance(request_ids, int) else request_ids
-
-        with torch.inference_mode():
-            if prompts is not None or prompts_token_ids is not None:
-                self.add_request(
-                    request_ids=request_ids,
-                    prompts=prompts,
-                    prompts_token_ids=prompts_token_ids,
-                    **gen_config_dict,
-                )
-
-            output_seqs_list = []
-            total_tokens_list = []
-
-            # intuition: If user provide a generation config, we should replace the existing one.
-            if generation_config is not None:
-                self.generation_config = generation_config
-                self.generation_config_dict = gen_config_dict
-
-            if self.use_spec_dec:
-                assert self.drafter is not None, "Drafter Model is not initialized."
-                while self.request_handler.check_unfinished_seqs():
-                    output_seqs_list += self.steps_spec_dec()
-            else:
-                while self.request_handler.check_unfinished_seqs():
-                    output_seqs_list += self.step()
-
-            output_seqs_list = sorted(output_seqs_list, key=lambda x: int(x.request_id))
-
-            for seq in output_seqs_list:
-                total_tokens_list.append(seq.input_token_id + seq.output_token_id)
-
-            output_str = self.tokenizer.batch_decode(total_tokens_list, skip_special_tokens=True)
-
-            if return_token_ids:
-                output_tokens_list = [seq.output_token_id for seq in output_seqs_list]
-                return output_str, output_tokens_list
-            else:
-                return output_str
-
-    @property
-    def has_prompt_template(self) -> bool:
-        """ """
-        return self.inference_config.prompt_template is not None
-
-    def format_prompt(self, prompts: Union[List[str], str]) -> Union[List[str], str]:
-        """
-        This method will format the input prompt according to the prompt template given to the InferenceConfig.
         """
-        assert (
-            self.has_prompt_template
-        ), "Found the prompt_template is None. Please provide a valid prompt_template in InferenceConfig."
 
-        if isinstance(prompts, (list, tuple)):
-            return [self.inference_config.prompt_template.format(input_text=prompt) for prompt in prompts]
-        elif isinstance(prompts, str):
-            return self.inference_config.prompt_template.format(input_text=prompts)
-        else:
-            raise TypeError(f"Expected the input prompt to be one of list, tuple, or str, but got {type(prompts)}.")
+        assert self.engine is not None, "Please init Engine first"
+        return self.engine.generate(request_ids=request_ids, prompts=prompts, *args, **kwargs)
 
     def add_request(
         self,
         request_ids: Union[List[int], int] = None,
         prompts: Union[List[str], str] = None,
-        prompts_token_ids: Union[List[int], torch.Tensor, np.ndarray] = None,
+        *args,
         **kwargs,
     ) -> None:
         """
@@ -630,168 +98,36 @@ def add_request(
             request_ids (List[int], optional): The request ID. Defaults to None.
             prompts (Union[List[str], optional): Input prompts. Defaults to None.
             prompts_token_ids (List[List[int]], optional): token ids of input prompts. Defaults to None.
+            kwargs: for LLM, it could be max_length, max_new_tokens, etc
+                    for diffusion, it could be prompt_2, prompt_3, num_images_per_prompt, do_classifier_free_guidance, negative_prompt, negative_prompt_2, negative_prompt_3, prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds, clip_skip, which aligns with diffusers
         """
+        assert self.engine is not None, "Please init Engine first"
+        self.engine.add_request(request_ids=request_ids, prompts=prompts, *args, **kwargs)
 
-        # apply the prompt template to the input prompts
-
-        if self.has_prompt_template and prompts is not None:
-            prompts = self.format_prompt(prompts)
-
-        block_size = self.inference_config.block_size
-
-        if request_ids is not None and not isinstance(request_ids, list):
-            request_ids = [request_ids]
-
-        if prompts is not None and not isinstance(prompts, list):
-            prompts = [prompts]
-
-        if prompts_token_ids is None:
-            assert prompts, "When the prompts_token_ids is none, the input prompt list must be provided."
-            prompts_token_ids = self.tokenizer.batch_encode_plus(prompts, padding=self.inference_config.pad_input)[
-                "input_ids"
-            ]
-
-        # list of torch Tensor
-        if isinstance(prompts_token_ids, list):
-            if isinstance(prompts_token_ids[0], torch.Tensor):
-                prompts_token_ids = [prompt_token_id.tolist() for prompt_token_id in prompts_token_ids]
-        elif isinstance(prompts_token_ids, torch.Tensor) or isinstance(prompts_token_ids, np.ndarray):
-            prompts_token_ids = prompts_token_ids.tolist()
-        else:
-            raise TypeError(
-                f"The dtype of prompts_token_ids must be one of list, torch.Tensor, np.ndarray, but got {type(prompts_token_ids)}."
-            )
-
-        assert (
-            len(prompts_token_ids[0]) <= self.inference_config.max_input_len
-        ), f"The length of input prompts {len(prompts_token_ids[0])} must be less than max_input_len {self.inference_config.max_input_len}."
-
-        prompts_num = len(prompts_token_ids)
-
-        for i in range(prompts_num):
-            if request_ids:
-                assert isinstance(
-                    request_ids[0], int
-                ), f"The request_id type must be int, but got {type(request_ids[0])}"
-                assert len(request_ids) == prompts_num
-                request_id = request_ids[i]
-            else:
-                request_id = next(self.counter)
-            if prompts == None:
-                prompt = None
-            else:
-                prompt = prompts[i]
-
-            max_length = kwargs.get("max_length", None)
-            max_new_tokens = kwargs.get("max_new_tokens", None)
-            if max_length is None and max_new_tokens is None:
-                max_new_tokens = self.generation_config.max_new_tokens or self.inference_config.max_output_len
-            elif max_length is not None:
-                max_new_tokens = max_length - len(prompts_token_ids[i])
+    def step(self):
+        assert self.engine is not None, "Please init Engine first"
+        return self.engine.step()
 
-            if not self.inference_config.enable_streamingllm:
-                assert (
-                    self.inference_config.max_output_len >= max_new_tokens
-                ), f"max_new_tokens={max_new_tokens} must be less than max_output_len={self.inference_config.max_output_len}."
-
-            sequence = Sequence(
-                request_id,
-                prompt,
-                prompts_token_ids[i],
-                block_size,
-                None,
-                self.tokenizer.eos_token_id,
-                self.tokenizer.pad_token_id,
-                max_output_len=max_new_tokens,
-                ignore_eos=self.inference_config.ignore_eos,
-            )
-            self.request_handler.add_sequence(sequence)
-
-    def prepare_input(self, batch: BatchBucket) -> Tuple[torch.Tensor, torch.Tensor, InputMetaData]:
-        input_ids = batch.get_1D_inputs()
-        sequence_lengths = batch.get_sequence_lengths()
-
-        if batch.is_prompts:
-            n_tokens = sequence_lengths.sum().item()
-        else:
-            n_tokens = batch.current_batch_size
-            if batch.use_spec_dec:
-                n_tokens = batch.num_tokens_to_verify + 1
-                assert n_tokens == input_ids.size(0)
-                n_tokens = n_tokens * batch.current_batch_size
-        output_tensor = torch.zeros(
-            (n_tokens, batch.num_heads * batch.head_dim), dtype=batch.dtype, device=batch.device
-        )
-
-        batch_token_ids = None
-        if (
-            self.generation_config.repetition_penalty != 1.0
-            or self.generation_config.no_repeat_ngram_size > 0
-            or self.generation_config.forced_eos_token_id is not None
-        ):
-            batch_token_ids = batch.batch_token_ids
-
-        # only when we have the graph for specific decoding batch size can we use the cuda graph for inference
-        use_cuda_graph = False
-        if self.use_cuda_graph and not batch.is_prompts and batch.current_batch_size in self.graph_runners.keys():
-            use_cuda_graph = True
-
-        input_meta_data = InputMetaData(
-            block_tables=batch.get_block_table_tensor(),
-            sequence_lengths=sequence_lengths,
-            fd_inter_tensor=batch.fd_inter_tensor,
-            batch_size=batch.current_batch_size,
-            is_prompts=batch.is_prompts,
-            use_cuda_kernel=self.inference_config.use_cuda_kernel,
-            use_cuda_graph=use_cuda_graph,
-            high_precision=self.high_precision,
-            kv_seq_len=sequence_lengths.max().item(),
-            head_dim=batch.head_dim,
-            dtype=batch.dtype,
-            use_spec_dec=batch.use_spec_dec,
-            num_tokens_to_verify=batch.num_tokens_to_verify,
-            batch_token_ids=batch_token_ids,
-        )
-
-        return input_ids, output_tensor, input_meta_data
-
-    def step(self) -> List[str]:
+    def __getattr__(self, name):
         """
-        In each step, do the follows:
-            1. Run RequestHandler.schedule() and get the batch used for inference.
-            2. Get the input, inputinfo and output placeholder from the batchbucket
-            3. Run model to generate the next token
-            4. Update waiting list and running list in RequestHandler and get finished sequences.
-            5. Decode and return finished sequences.
-
-        Returns:
-            List[str]: Decoded finished sequences generated by one step.
+        The Design logic of getattr, setattr:
+        1. Since InferenceEngine is a wrapper for DiffusionEngine/LLMEngine, we hope to invoke all the member of DiffusionEngine/LLMEngine like we just call the member of InferenceEngine.
+        2. When we call the __init__ of InferenceEngine, we don't want to setattr using self.__dict__["xxx"] = xxx, we want to use origin ways like self.xxx = xxx
+        So we set the attribute `_initialized`. And after initialized, if we couldn't get the member from InferenceEngine, we will try to get the member from self.engine(DiffusionEngine/LLMEngine)
         """
-
-        batch = self.request_handler.schedule()
-
-        input_token_ids, output_tensor, input_meta_data = self.prepare_input(batch)
-
-        if input_meta_data.use_cuda_graph:
-            model_executable = self.graph_runners[input_meta_data.batch_size]
+        if self.__dict__.get("_initialized", False):
+            if name in self.__dict__:
+                return self.__dict__[name]
+            else:
+                return getattr(self.engine, name)
         else:
-            model_executable = self.model
+            return self.__dict__[name]
 
-        # TODO: padding_id is used for generating attn_mask and will be removed if nopad version is supported.
-        logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
-        if self.inference_config.pad_input:
-            logits = logits[:, -1, :]
-
-        if self.inference_config.enable_streamingllm:
-            updated_block_ids = batch.streamingllm_update_batch(
-                self.inference_config.start_token_size, self.inference_config.generated_token_size
-            )
-            self.request_handler.streamingllm_free_block_tables(updated_block_ids)
-
-        next_tokens = search_tokens(
-            self.generation_config, logits, input_meta_data.is_prompts, batch_token_ids=input_meta_data.batch_token_ids
-        )
-        self.request_handler.append_next_tokens(next_tokens)
-        finished_sequences = self.request_handler.update()
-
-        return finished_sequences
+    def __setattr__(self, name, value):
+        if self.__dict__.get("_initialized", False):
+            if name in self.__dict__:
+                self.__dict__[name] = value
+            else:
+                setattr(self.engine, name, value)
+        else:
+            self.__dict__[name] = value
diff --git a/colossalai/inference/core/llm_engine.py b/colossalai/inference/core/llm_engine.py
new file mode 100644
index 000000000000..1dbc3ace85b6
--- /dev/null
+++ b/colossalai/inference/core/llm_engine.py
@@ -0,0 +1,758 @@
+import time
+from itertools import count
+from typing import Dict, List, Optional, Tuple, Type, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch import distributed as dist
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    GenerationConfig,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
+)
+from transformers.models.llama.modeling_llama import LlamaForCausalLM
+
+from colossalai.accelerator import get_accelerator
+from colossalai.cluster import ProcessGroupMesh
+from colossalai.inference.batch_bucket import BatchBucket
+from colossalai.inference.config import InferenceConfig, InputMetaData, ModelShardInferenceConfig
+from colossalai.inference.graph_runner import CUDAGraphRunner
+from colossalai.inference.modeling.policy import model_policy_map
+from colossalai.inference.sampler import search_tokens
+from colossalai.inference.spec import Drafter, GlideInput
+from colossalai.inference.struct import Sequence
+from colossalai.inference.utils import get_model_size, has_index_file
+from colossalai.interface import ModelWrapper
+from colossalai.lazy import LazyInitContext
+from colossalai.logging import get_dist_logger
+from colossalai.shardformer.policies.base_policy import Policy
+
+from .base_engine import BaseEngine
+from .request_handler import RequestHandler
+
+PP_AXIS, TP_AXIS = 0, 1
+
+_supported_models = {
+    "LlamaForCausalLM": LlamaForCausalLM,
+    "BaichuanForCausalLM": AutoModelForCausalLM,
+}
+
+_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
+
+
+class LLMEngine(BaseEngine):
+    """
+    InferenceEngine which manages the inference process..
+
+    Args:
+        model_or_path (nn.Module or str): Path or nn.Module of this model.
+        tokenizer Optional[(Union[PreTrainedTokenizer, PreTrainedTokenizerFast])]: Path of the tokenizer to use.
+        inference_config (Optional[InferenceConfig], optional): Store the configuration information related to inference.
+        verbose (bool): Determine whether or not to log the generation process.
+        model_policy ("Policy"): the policy to shardformer model. It will be determined by the model type if not provided.
+    """
+
+    def __init__(
+        self,
+        model_or_path: Union[nn.Module, str],
+        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] = None,
+        inference_config: InferenceConfig = None,
+        verbose: bool = False,
+        model_policy: Union[Policy, type[Policy]] = None,
+    ) -> None:
+        self.inference_config = inference_config
+        self.dtype = inference_config.dtype
+        self.high_precision = inference_config.high_precision
+
+        self.verbose = verbose
+        self.logger = get_dist_logger(__name__)
+        self.model_shard_infer_config = inference_config.to_model_shard_inference_config()
+
+        self.init_model(model_or_path, model_policy, self.model_shard_infer_config)
+
+        self.generation_config = inference_config.to_generation_config(self.model_config)
+        self.generation_config_dict = self.generation_config.to_dict()
+
+        self.tokenizer = tokenizer
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        self.request_handler = RequestHandler(self.inference_config, self.model_config)
+        self.k_cache, self.v_cache = self.request_handler.get_kvcache()
+        # DISCUSS maybe move this into batch info?
+
+        self.counter = count()
+
+        self.use_cuda_graph = self.inference_config.use_cuda_graph
+        if self.use_cuda_graph:
+            self.graph_runners: Dict[int, CUDAGraphRunner] = {}
+            self.graph_memory_pool = None  # Set during graph capture.
+            if verbose:
+                self.logger.info("Colossal AI CUDA Graph Capture on")
+
+            self.capture_model(self.k_cache, self.v_cache)
+
+        # Model and relatable attrs of speculative decoding will be set by `enable_spec_dec`
+        self.use_spec_dec = self.inference_config.use_spec_dec
+
+        self.drafter_model = None
+        self.drafter = None
+        self.use_glide = False
+        self.n_spec_tokens = self.inference_config.max_n_spec_tokens
+
+        self._verify_args()
+
+    def init_model(
+        self,
+        model_or_path: Union[nn.Module, str],
+        model_policy: Union[Policy, Type[Policy]] = None,
+        model_shard_infer_config: ModelShardInferenceConfig = None,
+    ):
+        """
+        Shard model or/and Load weight
+
+        Args:
+            model_or_path Union[nn.Module, str]: path to the checkpoint or model of transformer format.
+            model_policy (Policy): the policy to replace the model.
+            model_inference_config: the configuration for modeling initialization when inference.
+            model_shard_infer_config (ModelShardInferenceConfig): the configuration for init of module when inference.
+        """
+        pretrained_path = None
+        if isinstance(model_or_path, str):
+            import colossalai.interface.pretrained as pretrained_utils
+
+            try:
+                hf_config = AutoConfig.from_pretrained(model_or_path, trust_remote_code=True, torch_dtype=self.dtype)
+                arch = getattr(hf_config, "architectures")[0]
+                if arch in _supported_models.keys():
+                    if arch == "BaichuanForCausalLM":
+                        self.logger.warning(
+                            "Attention ! We use lazy init by default, which could be faster for model loading. For baichuan model, the output maybe have a slight difference with transformers"
+                        )
+                    ctx = LazyInitContext(default_device="cuda")
+                    with ctx:
+                        model = _supported_models[arch].from_pretrained(
+                            model_or_path, trust_remote_code=True, torch_dtype=self.dtype
+                        )
+                    pretrained_path = pretrained_utils.get_pretrained_path(model)
+                else:
+                    # TODO(char-1ee): if the model not supported, use transformers APIs to load and generate
+                    raise ValueError(f"Model {arch} is not supported.")
+
+            except Exception as e:
+                self.logger.error(
+                    f"An exception occurred during loading model: {e}, model should be loaded by transformers\n"
+                )
+        else:
+            model = model_or_path
+
+        self.model_config = model.config
+
+        torch.cuda.empty_cache()
+        init_gpu_memory = torch.cuda.mem_get_info()[0]
+
+        self.device = get_accelerator().get_current_device()
+        if self.verbose:
+            self.logger.info(f"the device is {self.device}")
+
+        model = model.to(self.dtype).eval()
+
+        if self.verbose:
+            self.logger.info(
+                f"Before the shard, Rank: [{dist.get_rank()}], model size: {get_model_size(model)} GB, model's device is: {model.device}"
+            )
+
+        if model_policy is None:
+            prefix = "nopadding" if not self.inference_config.pad_input else "padding"
+            model_policy_key = f"{prefix}_{getattr(self.model_config, 'model_type', None)}"
+            model_policy = model_policy_map.get(model_policy_key)
+
+        if not isinstance(model_policy, Policy):
+            try:
+                model_policy = model_policy()
+            except Exception as e:
+                raise ValueError(f"Unable to instantiate model policy: {e}")
+
+        assert isinstance(model_policy, Policy), f"Invalid type of model policy: {type(model_policy)}"
+        pg_mesh = ProcessGroupMesh(self.inference_config.pp_size, self.inference_config.tp_size)
+        tp_group = pg_mesh.get_group_along_axis(TP_AXIS)
+
+        self.model = self._shardformer(
+            model,
+            model_policy,
+            model_shard_infer_config,
+            None,
+            tp_group=tp_group,
+        )
+
+        self.model = ModelWrapper(model).to(self.device)
+
+        if self.verbose:
+            self.logger.info(
+                f"After the shard, Rank: [{dist.get_rank()}], model size: {get_model_size(self.model)} GB, model's device is: {model.device}"
+            )
+
+        if pretrained_path:
+            from colossalai.inference.core.plugin import InferCheckpoint_io
+
+            cpt_io = InferCheckpoint_io()
+            if_has_index_file, model_index_file = has_index_file(pretrained_path)
+            assert if_has_index_file, "the model path is invalid"
+            cpt_io.load_model(self.model, model_index_file)
+
+        free_gpu_memory, _ = torch.cuda.mem_get_info()
+        peak_memory = init_gpu_memory - free_gpu_memory
+        if self.verbose:
+            self.logger.info(
+                f"Rank [{dist.get_rank()}], Model Weight Max Occupy {peak_memory / (1024 ** 3)} GB, Model size: {get_model_size(self.model)} GB"
+            )
+
+    @torch.inference_mode()
+    def capture_model(self, k_cache: List[torch.Tensor], v_cache: List[torch.Tensor]):
+        assert self.use_cuda_graph, "please turn on the cuda graph"
+
+        if self.verbose:
+            self.logger.info("Colossal AI CUDA Graph Capture begin")
+
+        t_capture_begin = time.perf_counter()
+
+        block_size = self.inference_config.block_size
+        head_dim = self.model_config.hidden_size // self.model_config.num_attention_heads
+
+        # Prepare dummy inputs. These will be reused for all batch sizes.
+        max_batch_size = max(_BATCH_SIZES_TO_CAPTURE)
+        max_context_len_to_capture = self.inference_config.max_context_len_to_capture
+        max_num_blocks = (max_context_len_to_capture + block_size - 1) // block_size
+        input_tokens_ids = torch.zeros(max_batch_size, dtype=torch.long).cuda()
+        # self.graph_block_tables = np.zeros((max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32)
+        self.graph_block_tables = np.full((max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), -1, dtype=np.int32)
+        self.graph_block_tables[:, 0] = np.arange(max_num_blocks, max_num_blocks + max(_BATCH_SIZES_TO_CAPTURE))
+        self.graph_block_tables[0, :] = np.arange(
+            0, max_num_blocks
+        )  # NOTE this is a hack to insure cuda grpah could capture the fixed cuda kernel grid in flash decoding, to make the first seqlen as the max_seq_len
+        block_tables = torch.from_numpy(self.graph_block_tables).cuda()
+        output_tensor = torch.zeros(
+            (max_batch_size, self.model_config.num_attention_heads * head_dim), dtype=self.dtype, device=self.device
+        )
+        fd_inter_tensor = self.request_handler.running_bb.fd_inter_tensor
+
+        max_num_seqs = self.inference_config.max_batch_size
+        batch_size_capture_list = [bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= max_num_seqs]
+        sequence_lengths = torch.ones(max_batch_size, dtype=torch.int).cuda()
+        # NOTE this is a hack to insure cuda grpah could capture the fixed cuda kernel grid in flash decoding, to make the first seqlen as the max_seq_len
+        sequence_lengths[0] = torch.tensor(
+            self.inference_config.max_context_len_to_capture - 1, dtype=torch.int32
+        ).cuda()
+
+        # NOTE: Capturing the largest batch size first may help reduce the
+        # memory usage of CUDA graph.
+        for batch_size in reversed(batch_size_capture_list):
+            if self.verbose:
+                self.logger.info(f"batch size {batch_size} graph capturing")
+
+            input_meta_data = InputMetaData(
+                block_tables=block_tables[:batch_size],
+                sequence_lengths=sequence_lengths[:batch_size],
+                fd_inter_tensor=fd_inter_tensor,
+                batch_size=batch_size,
+                is_prompts=False,
+                use_cuda_graph=True,
+                high_precision=False,
+                kv_seq_len=sequence_lengths[:batch_size].max().item(),
+                head_dim=head_dim,
+                dtype=self.dtype,
+            )
+
+            graph_runner = CUDAGraphRunner(self.model)
+            graph_runner.capture(
+                input_tokens_ids[:batch_size],
+                output_tensor[:batch_size],
+                input_meta_data,
+                k_caches=k_cache,
+                v_caches=v_cache,
+                memory_pool=self.graph_memory_pool,
+            )
+            self.graph_memory_pool = graph_runner.graph.pool()
+            self.graph_runners[batch_size] = graph_runner
+
+        t_capture_end = time.perf_counter()
+
+        if self.verbose:
+            self.logger.info(f"CUDA Graph capture time: {t_capture_end - t_capture_begin} s")
+
+    def _verify_args(self) -> None:
+        """Verify the input args"""
+        if not isinstance(self.inference_config, InferenceConfig):
+            raise TypeError("Invalid type of inference config provided.")
+        if not isinstance(self.model, nn.Module):
+            raise TypeError(f"the model type must be nn.Module, but got {type(self.model)}")
+        if not isinstance(self.tokenizer, (PreTrainedTokenizerFast, PreTrainedTokenizer)):
+            raise TypeError(
+                f"the tokenizer type must be PreTrainedTokenizer or PreTrainedTokenizerFast, but got {type(self.tokenizer)}"
+            )
+        if isinstance(self.model, ModelWrapper):
+            model = self.model.module
+        assert (
+            model.__class__.__name__ in _supported_models.keys()
+        ), f"Model {self.model.__class__.__name__} is not supported."
+
+    def enable_spec_dec(
+        self,
+        drafter_model: nn.Module = None,
+        n_spec_tokens: int = None,
+        use_glide_drafter: bool = False,
+    ) -> None:
+        """Initialize drafter (if it has not yet), and enable Speculative Decoding for subsequent generations.
+
+        Args:
+            drafter_model (nn.Module): The drafter model (small model) used to speculate tokens.
+                If provided, the previous drafter and drafter model, if exist, will be overwritten.
+            n_spec_tokens (Optional[int]): The number of tokens to speculate in each round of speculating-verifying.
+                If not provided, `max_n_spec_tokens` in InferenceConfig will be used.
+            use_glide_drafter (bool): Whether to use glide model for speculative decoding. Defaults to False.
+                If True, the drafter model will be replaced by a glide model.
+
+        ```python
+        ...
+        engine = InferenceEngine(model, tokenizer, inference_config)
+
+        engine.enable_spec_dec(drafter_model, n_spec_tokens=5)
+        engine.generate(...)  # Speculative Decoding
+
+        engine.disable_spec_dec()
+        engine.generate(...)  # Normal generation
+
+        engine.enable_spec_dec()
+        engine.generate(...)  # Speculative-Decoding using previously set drafter model and number of spec tokens
+        engine.clear_spec_dec()
+        ```
+        """
+
+        if drafter_model is None and self.drafter is None:
+            raise ValueError("Drafter not initialized. Please provide a Drafter Model")
+        if n_spec_tokens is not None:
+            assert 1 < n_spec_tokens <= self.inference_config.max_n_spec_tokens
+            self.n_spec_tokens = n_spec_tokens
+        if drafter_model is not None:
+            assert isinstance(drafter_model, nn.Module)
+            # overwrite the drafter, if exists
+            self.clear_spec_dec()
+            self.drafter_model = drafter_model
+            self.drafter = Drafter(
+                self.drafter_model,
+                self.tokenizer,
+                device=self.device,
+                dtype=self.dtype,
+            )
+
+            # check if the provided drafter model is compatible with GLIDE structure
+            # when `use_glide_drafter` is set to True
+            if (
+                use_glide_drafter
+                and hasattr(drafter_model, "model")
+                and hasattr(drafter_model.model, "layers")
+                and hasattr(drafter_model.model.layers[0], "cross_attn")
+            ):
+                self.use_glide = use_glide_drafter
+            elif use_glide_drafter:
+                self.logger.warning(
+                    f"`use_glide_drafter` is provided as {use_glide_drafter}, "
+                    f"but the provided drafter model is not compatible with GLIDE structure."
+                    f"Falling back to use the default drafter model (non-GLIDE)."
+                )
+        self.request_handler.set_spec_dec_mode(self.n_spec_tokens)
+        # using speculative decoding for subsequent generations
+        self.use_spec_dec = True
+
+    def disable_spec_dec(self) -> None:
+        """Disable using speculative decoding for subsequent generations."""
+        self.request_handler.unset_spec_dec_mode()
+        # set back to the maximum number of tokens to speculate
+        self.n_spec_tokens = self.inference_config.max_n_spec_tokens
+        self.use_glide = False
+        self.use_spec_dec = False
+
+    def clear_spec_dec(self) -> None:
+        """Clear relatable structures of speculative decoding, if exist."""
+        if self.use_spec_dec:
+            self.disable_spec_dec()
+        if self.drafter_model or self.drafter:
+            self.drafter_model = None
+            self.drafter = None
+            torch.cuda.empty_cache()
+        self.use_glide = False
+        self.use_spec_dec = False
+
+    def steps_spec_dec(self) -> List[Sequence]:
+        """
+        Run Speculative Decoding steps. This is like retrieving a single batch and launch inference
+        with many steps of speculating by a drafter model as well as verifying by a main model.
+
+        Returns:
+            List[Sequence]: finished sequences generated by one step.
+        """
+        batch = self.request_handler.schedule()  # prefill batch
+        assert batch.current_batch_size == 1, "Only support bsz 1 for speculative decoding for now."
+
+        input_token_ids, output_tensor, input_meta_data = self.prepare_input(batch)
+
+        if input_meta_data.use_cuda_graph:
+            model_executable = self.graph_runners[input_meta_data.batch_size]
+        else:
+            model_executable = self.model
+
+        # 1. Prefill small model (Drafter) - fill past kv cache for drafter model
+        # NOTE For glide drafter models, we won't actually apply glide during prefill stage
+        drafter_out = self.drafter.speculate(input_token_ids, 1, None)
+        next_token_ids_spec = drafter_out.next_tokens
+        drafter_past_key_values = drafter_out.past_key_values
+
+        # 2. Prefill main model (Verifier) - fill past kv cache for main model
+        logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
+        next_tokens = search_tokens(self.generation_config, logits, batch_token_ids=batch.batch_token_ids)
+        # append new inputs to the batch, temporarily
+        batch.append_batch_tokens(next_tokens)
+        self.request_handler.allocate_batch_spec_dec(batch, 1)
+        already_allocated_kv_len = batch.seq_lengths[0].item()
+        input_token_ids = batch.get_1D_inputs_spec_dec(1)
+
+        finished_sequences = self.request_handler.update()
+
+        while True:
+            # HACK Retrieve the running batch
+            #      Using RequestHandler.schedule here will re-allocate same kv cache for the batch
+            batch = self.request_handler.running_bb  # running batch
+            assert batch.current_batch_size == 1, "Only support bsz 1 for speculative decoding for now."
+
+            # 3. Decoding - Drafter model speculates `n` tokens
+            glide_input = None
+            if self.use_glide:
+                glide_input = GlideInput(
+                    batch.get_block_table_tensor(),
+                    self.k_cache[-1],  # use kv cahces of the last layer
+                    self.v_cache[-1],
+                    batch.get_sequence_lengths(),
+                    n_spec_tokens=self.n_spec_tokens,
+                )
+
+            drafter_out = self.drafter.speculate(
+                input_token_ids,
+                self.n_spec_tokens,
+                drafter_past_key_values,
+                glide_input=glide_input,
+            )
+            next_token_ids_spec = drafter_out.next_tokens
+            drafter_past_key_values = drafter_out.past_key_values
+            drafter_spec_length = drafter_out.speculated_length
+
+            for next_token_id_spec in next_token_ids_spec:
+                self.request_handler.append_next_tokens(next_token_id_spec.unsqueeze(0))
+            cur_length = batch.seq_lengths[0].item()
+            if already_allocated_kv_len < cur_length:
+                self.request_handler.allocate_batch_spec_dec(batch, n=cur_length - already_allocated_kv_len)
+                already_allocated_kv_len = cur_length
+
+            # 4. Decoding - Main model verifies `n` tokens in parallel
+            if drafter_spec_length < batch.num_tokens_to_verify:
+                batch.set_use_spec_dec(num_tokens_to_verify=drafter_spec_length)
+            input_token_ids, output_tensor, input_meta_data = self.prepare_input(batch)
+            logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
+
+            next_tokens = search_tokens(self.generation_config, logits, batch_token_ids=batch.batch_token_ids)
+
+            # 5. Compare and process the results
+            diff_indexes = torch.nonzero(~(next_tokens[:-1] == next_token_ids_spec))
+            n_matches = drafter_spec_length if diff_indexes.size(0) == 0 else diff_indexes[0][0].item()
+
+            # revoke appended tokens for each Sequence in the current batch
+            batch.revoke_batch_tokens(drafter_spec_length - n_matches)  # revoke drafted tokens
+
+            # append the last correct token generated by the main model
+            self.request_handler.append_next_tokens(next_tokens[n_matches].unsqueeze(0))
+
+            # trim past key values of the drafter model
+            drafter_past_key_values = Drafter.trim_kv_cache(
+                drafter_past_key_values, drafter_spec_length - n_matches - 1
+            )
+
+            # prepare inputs for the next round of speculation
+            n = 1 if n_matches < drafter_spec_length else 2
+            input_token_ids = batch.get_1D_inputs_spec_dec(n)
+
+            self.request_handler.update_batch_finished(batch, generation_config=self.generation_config)
+            finished_sequences = self.request_handler.update()
+            if len(finished_sequences) > 0:
+                break
+
+        # Reset back the number of speculated tokens of the batch,
+        # this is used to handle the last round of speculation, in which case the number of speculated tokens
+        # by the drafter is less than the number of speculated tokens set to the engine.
+        batch.set_use_spec_dec(num_tokens_to_verify=self.n_spec_tokens)
+
+        return finished_sequences
+
+    def generate(
+        self,
+        request_ids: Union[List[int], int] = None,
+        prompts: Union[List[str], str] = None,
+        prompts_token_ids: Union[List[int], torch.Tensor, np.ndarray] = None,
+        return_token_ids: bool = False,
+        generation_config: Optional[GenerationConfig] = None,
+    ) -> Union[List[str], Tuple[List[str], List[List[int]]]]:
+        """
+        Executing the inference step.
+
+        Args:
+            request_ids (List[int], optional): The request ID. Defaults to None.
+            prompts (Union[List[str], optional): Input prompts. Defaults to None.
+            prompts_token_ids (Union[List[int], torch.Tensor, np.ndarray], optional): token ids of input prompts. Defaults to None.
+            return_token_ids (bool, optional): Whether to return output token ids. Defaults to False.
+            generation_config (Optional[GenerationConfig], optional): Huggingface GenerationConfig used for inference. Defaults to None.
+
+        Returns:
+            Union[List[str], Tuple[List[str], List[List[int]]]]: Inference result returned by one generation.
+        """
+
+        gen_config_dict = generation_config.to_dict() if generation_config is not None else {}
+        prompts = [prompts] if isinstance(prompts, str) else prompts
+        request_ids = [request_ids] if isinstance(request_ids, int) else request_ids
+
+        with torch.inference_mode():
+            if prompts is not None or prompts_token_ids is not None:
+                self.add_request(
+                    request_ids=request_ids,
+                    prompts=prompts,
+                    prompts_token_ids=prompts_token_ids,
+                    **gen_config_dict,
+                )
+
+            output_seqs_list = []
+            total_tokens_list = []
+
+            # intuition: If user provide a generation config, we should replace the existing one.
+            if generation_config is not None:
+                self.generation_config = generation_config
+                self.generation_config_dict = gen_config_dict
+
+            if self.use_spec_dec:
+                assert self.drafter is not None, "Drafter Model is not initialized."
+                while self.request_handler.check_unfinished_reqs():
+                    output_seqs_list += self.steps_spec_dec()
+            else:
+                while self.request_handler.check_unfinished_reqs():
+                    output_seqs_list += self.step()
+
+            output_seqs_list = sorted(output_seqs_list, key=lambda x: int(x.request_id))
+
+            for seq in output_seqs_list:
+                total_tokens_list.append(seq.input_token_id + seq.output_token_id)
+
+            output_str = self.tokenizer.batch_decode(total_tokens_list, skip_special_tokens=True)
+
+            if return_token_ids:
+                output_tokens_list = [seq.output_token_id for seq in output_seqs_list]
+                return output_str, output_tokens_list
+            else:
+                return output_str
+
+    @property
+    def has_prompt_template(self) -> bool:
+        """ """
+        return self.inference_config.prompt_template is not None
+
+    def format_prompt(self, prompts: Union[List[str], str]) -> Union[List[str], str]:
+        """
+        This method will format the input prompt according to the prompt template given to the InferenceConfig.
+        """
+        assert (
+            self.has_prompt_template
+        ), "Found the prompt_template is None. Please provide a valid prompt_template in InferenceConfig."
+
+        if isinstance(prompts, (list, tuple)):
+            return [self.inference_config.prompt_template.format(input_text=prompt) for prompt in prompts]
+        elif isinstance(prompts, str):
+            return self.inference_config.prompt_template.format(input_text=prompts)
+        else:
+            raise TypeError(f"Expected the input prompt to be one of list, tuple, or str, but got {type(prompts)}.")
+
+    def add_request(
+        self,
+        request_ids: Union[List[int], int] = None,
+        prompts: Union[List[str], str] = None,
+        prompts_token_ids: Union[List[int], torch.Tensor, np.ndarray] = None,
+        **kwargs,
+    ) -> None:
+        """
+        Add requests.
+
+        Args:
+            request_ids (List[int], optional): The request ID. Defaults to None.
+            prompts (Union[List[str], optional): Input prompts. Defaults to None.
+            prompts_token_ids (List[List[int]], optional): token ids of input prompts. Defaults to None.
+        """
+
+        # apply the prompt template to the input prompts
+
+        if self.has_prompt_template and prompts is not None:
+            prompts = self.format_prompt(prompts)
+
+        block_size = self.inference_config.block_size
+
+        if request_ids is not None and not isinstance(request_ids, list):
+            request_ids = [request_ids]
+
+        if prompts is not None and not isinstance(prompts, list):
+            prompts = [prompts]
+
+        if prompts_token_ids is None:
+            assert prompts, "When the prompts_token_ids is none, the input prompt list must be provided."
+            prompts_token_ids = self.tokenizer.batch_encode_plus(prompts, padding=self.inference_config.pad_input)[
+                "input_ids"
+            ]
+
+        # list of torch Tensor
+        if isinstance(prompts_token_ids, list):
+            if isinstance(prompts_token_ids[0], torch.Tensor):
+                prompts_token_ids = [prompt_token_id.tolist() for prompt_token_id in prompts_token_ids]
+        elif isinstance(prompts_token_ids, torch.Tensor) or isinstance(prompts_token_ids, np.ndarray):
+            prompts_token_ids = prompts_token_ids.tolist()
+        else:
+            raise TypeError(
+                f"The dtype of prompts_token_ids must be one of list, torch.Tensor, np.ndarray, but got {type(prompts_token_ids)}."
+            )
+
+        assert (
+            len(prompts_token_ids[0]) <= self.inference_config.max_input_len
+        ), f"The length of input prompts {len(prompts_token_ids[0])} must be less than max_input_len {self.inference_config.max_input_len}."
+
+        prompts_num = len(prompts_token_ids)
+
+        for i in range(prompts_num):
+            if request_ids:
+                assert isinstance(
+                    request_ids[0], int
+                ), f"The request_id type must be int, but got {type(request_ids[0])}"
+                assert len(request_ids) == prompts_num
+                request_id = request_ids[i]
+            else:
+                request_id = next(self.counter)
+            if prompts == None:
+                prompt = None
+            else:
+                prompt = prompts[i]
+
+            max_length = kwargs.get("max_length", None)
+            max_new_tokens = kwargs.get("max_new_tokens", None)
+            if max_length is None and max_new_tokens is None:
+                max_new_tokens = self.generation_config.max_new_tokens or self.inference_config.max_output_len
+            elif max_length is not None:
+                max_new_tokens = max_length - len(prompts_token_ids[i])
+
+            if not self.inference_config.enable_streamingllm:
+                assert (
+                    self.inference_config.max_output_len >= max_new_tokens
+                ), f"max_new_tokens={max_new_tokens} must be less than max_output_len={self.inference_config.max_output_len}."
+
+            sequence = Sequence(
+                request_id,
+                prompt,
+                prompts_token_ids[i],
+                block_size,
+                None,
+                self.tokenizer.eos_token_id,
+                self.tokenizer.pad_token_id,
+                max_output_len=max_new_tokens,
+                ignore_eos=self.inference_config.ignore_eos,
+            )
+            self.request_handler.add_sequence(sequence)
+
+    def prepare_input(self, batch: BatchBucket) -> Tuple[torch.Tensor, torch.Tensor, InputMetaData]:
+        input_ids = batch.get_1D_inputs()
+        sequence_lengths = batch.get_sequence_lengths()
+
+        if batch.is_prompts:
+            n_tokens = sequence_lengths.sum().item()
+        else:
+            n_tokens = batch.current_batch_size
+            if batch.use_spec_dec:
+                n_tokens = batch.num_tokens_to_verify + 1
+                assert n_tokens == input_ids.size(0)
+                n_tokens = n_tokens * batch.current_batch_size
+        output_tensor = torch.zeros(
+            (n_tokens, batch.num_heads * batch.head_dim), dtype=batch.dtype, device=batch.device
+        )
+
+        batch_token_ids = None
+        if (
+            self.generation_config.repetition_penalty != 1.0
+            or self.generation_config.no_repeat_ngram_size > 0
+            or self.generation_config.forced_eos_token_id is not None
+        ):
+            batch_token_ids = batch.batch_token_ids
+
+        # only when we have the graph for specific decoding batch size can we use the cuda graph for inference
+        use_cuda_graph = False
+        if self.use_cuda_graph and not batch.is_prompts and batch.current_batch_size in self.graph_runners.keys():
+            use_cuda_graph = True
+
+        input_meta_data = InputMetaData(
+            block_tables=batch.get_block_table_tensor(),
+            sequence_lengths=sequence_lengths,
+            fd_inter_tensor=batch.fd_inter_tensor,
+            batch_size=batch.current_batch_size,
+            is_prompts=batch.is_prompts,
+            use_cuda_kernel=self.inference_config.use_cuda_kernel,
+            use_cuda_graph=use_cuda_graph,
+            high_precision=self.high_precision,
+            kv_seq_len=sequence_lengths.max().item(),
+            head_dim=batch.head_dim,
+            dtype=batch.dtype,
+            use_spec_dec=batch.use_spec_dec,
+            num_tokens_to_verify=batch.num_tokens_to_verify,
+            batch_token_ids=batch_token_ids,
+        )
+
+        return input_ids, output_tensor, input_meta_data
+
+    def step(self) -> List[str]:
+        """
+        In each step, do the follows:
+            1. Run RequestHandler.schedule() and get the batch used for inference.
+            2. Get the input, inputinfo and output placeholder from the batchbucket
+            3. Run model to generate the next token
+            4. Update waiting list and running list in RequestHandler and get finished sequences.
+            5. Decode and return finished sequences.
+
+        Returns:
+            List[str]: Decoded finished sequences generated by one step.
+        """
+
+        batch = self.request_handler.schedule()
+
+        input_token_ids, output_tensor, input_meta_data = self.prepare_input(batch)
+
+        if input_meta_data.use_cuda_graph:
+            model_executable = self.graph_runners[input_meta_data.batch_size]
+        else:
+            model_executable = self.model
+
+        # TODO: padding_id is used for generating attn_mask and will be removed if nopad version is supported.
+        logits = model_executable(input_token_ids, output_tensor, input_meta_data, self.k_cache, self.v_cache)
+        if self.inference_config.pad_input:
+            logits = logits[:, -1, :]
+
+        if self.inference_config.enable_streamingllm:
+            updated_block_ids = batch.streamingllm_update_batch(
+                self.inference_config.start_token_size, self.inference_config.generated_token_size
+            )
+            self.request_handler.streamingllm_free_block_tables(updated_block_ids)
+
+        next_tokens = search_tokens(
+            self.generation_config, logits, input_meta_data.is_prompts, batch_token_ids=input_meta_data.batch_token_ids
+        )
+        self.request_handler.append_next_tokens(next_tokens)
+        finished_sequences = self.request_handler.update()
+
+        return finished_sequences
diff --git a/colossalai/inference/core/request_handler.py b/colossalai/inference/core/request_handler.py
index 512eaea71c7b..393347c31e16 100644
--- a/colossalai/inference/core/request_handler.py
+++ b/colossalai/inference/core/request_handler.py
@@ -8,7 +8,7 @@
 from colossalai.inference.config import InferenceConfig
 from colossalai.inference.flash_decoding_utils import FDIntermTensors
 from colossalai.inference.kv_cache import KVCacheManager, RPCKVCacheManager
-from colossalai.inference.struct import RequestStatus, Sequence
+from colossalai.inference.struct import DiffusionSequence, RequestStatus, Sequence
 from colossalai.logging import get_dist_logger
 
 logger = get_dist_logger(__name__)
@@ -98,7 +98,46 @@ def move_prefill_to_decoding(self, seq_ids: List[int]) -> None:
             self._decoding[seq_id] = self._prefill.pop(seq_id)
 
 
-class RequestHandler:
+class NaiveRequestHandler:
+    def __init__(self) -> None:
+        self.running_list: List[DiffusionSequence] = []
+        self.waiting_list: List[str] = []
+
+    def _has_waiting(self) -> bool:
+        return any(lst for lst in self.waiting_list)
+
+    def _has_running(self) -> bool:
+        return any(lst for lst in self.running_list)
+
+    def check_unfinished_reqs(self):
+        return self._has_waiting() or self._has_running()
+
+    def add_sequence(self, seq: DiffusionSequence):
+        """
+        Add the request to waiting list.
+        """
+        assert not self._find_sequence(seq.request_id), f"Sequence {seq.request_id} already exists."
+        self.waiting_list.append(seq)
+
+    def _find_sequence(self, request_id: int) -> DiffusionSequence:
+        """
+        Find the request by request_id.
+        """
+        for lst in enumerate(self.waiting_list + self.running_list):
+            for seq in lst:
+                if seq.request_id == request_id:
+                    return seq
+        return None
+
+    def schedule(self):
+        ret = None
+        if self._has_waiting:
+            ret = self.waiting_list[0]
+            self.waiting_list = self.waiting_list[1:]
+        return ret
+
+
+class RequestHandler(NaiveRequestHandler):
     """
     RequestHandler is the core for handling existing requests and updating current batch.
     During generation process, we call schedule function each iteration to update current batch.
@@ -176,12 +215,12 @@ def __init__(self, inference_config: InferenceConfig, model_config: PretrainedCo
             generated_token_size=inference_config.generated_token_size,
         )
 
+    def _has_running(self) -> bool:
+        return not self.running_bb.is_empty()
+
     def _init_cache(self, model_config):
         self.cache_manager = KVCacheManager(self.inference_config, model_config)
 
-    def _has_waiting(self) -> bool:
-        return any(lst for lst in self.waiting_list)
-
     def get_kvcache(self):
         return self.cache_manager.get_kv_cache()
 
@@ -318,7 +357,7 @@ def update_batch_finished(self, batch: BatchBucket, generation_config: Generatio
             if seq.output_token_id[-1] == generation_config.eos_token_id or seq.output_len >= max_new_tokens:
                 seq.mark_finished()
 
-    def check_unfinished_seqs(self) -> bool:
+    def check_unfinished_reqs(self) -> bool:
         return self._has_waiting() or not self.running_list.is_empty()
 
     def total_requests_in_batch_bucket(self) -> int:
diff --git a/colossalai/inference/modeling/layers/diffusion.py b/colossalai/inference/modeling/layers/diffusion.py
new file mode 100644
index 000000000000..9dc90733d82a
--- /dev/null
+++ b/colossalai/inference/modeling/layers/diffusion.py
@@ -0,0 +1,54 @@
+import inspect
+import types
+
+import torch
+from torch import nn
+
+
+class DiffusionPipe(nn.Module):
+    """
+    This Class convert a class of `DiffusionPipeline` into `nn.Module` and reserve most of origin attr,function and property.
+    """
+
+    def __init__(self, source_obj) -> None:
+        super(DiffusionPipe, self).__init__()
+
+        for k, v in source_obj.__dict__.items():
+            if isinstance(v, nn.Module):
+                self.add_module(k, v)
+            else:
+                setattr(self, k, v)
+
+        skip_list = ["_execution_device", "to", "device"]  # this
+
+        for name, member in inspect.getmembers(source_obj.__class__):
+            if name in skip_list:
+                continue
+            if not name.startswith("__") and not name.endswith("__"):
+                if isinstance(member, property):
+                    setattr(self.__class__, name, member)
+                elif inspect.isfunction(member) or inspect.ismethod(member):
+                    bound_method = types.MethodType(member, self)
+                    setattr(self, name, bound_method)
+                elif not callable(member) and not isinstance(member, property):
+                    setattr(self, name, member)
+            elif name == "__call__":
+                bound_method = types.MethodType(member, self)
+                setattr(self, "_forward", bound_method)
+
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        [`~DiffusionPipeline.enable_sequential_cpu_offload`] the execution device can only be inferred from
+        Accelerate's module hooks.
+        """
+        # return self.device
+        return torch.device("cuda")
+
+    @property
+    def device(self):
+        next(self.parameters()).device
+
+    def forward(self, *args, **kwargs):
+        return self._forward(*args, **kwargs)
diff --git a/colossalai/inference/modeling/layers/distrifusion.py b/colossalai/inference/modeling/layers/distrifusion.py
new file mode 100644
index 000000000000..ea97cceefac9
--- /dev/null
+++ b/colossalai/inference/modeling/layers/distrifusion.py
@@ -0,0 +1,626 @@
+# Code refer and adapted from:
+# https://github.com/huggingface/diffusers/blob/v0.29.0-release/src/diffusers
+# https://github.com/PipeFusion/PipeFusion
+
+import inspect
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from diffusers.models import attention_processor
+from diffusers.models.attention import Attention
+from diffusers.models.embeddings import PatchEmbed, get_2d_sincos_pos_embed
+from diffusers.models.transformers.pixart_transformer_2d import PixArtTransformer2DModel
+from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel
+from torch import nn
+from torch.distributed import ProcessGroup
+
+from colossalai.inference.config import ModelShardInferenceConfig
+from colossalai.logging import get_dist_logger
+from colossalai.shardformer.layer.parallel_module import ParallelModule
+from colossalai.utils import get_current_device
+
+try:
+    from flash_attn import flash_attn_func
+
+    HAS_FLASH_ATTN = True
+except ImportError:
+    HAS_FLASH_ATTN = False
+
+
+logger = get_dist_logger(__name__)
+
+
+# adapted from https://github.com/huggingface/diffusers/blob/v0.29.0-release/src/diffusers/models/transformers/transformer_2d.py
+def PixArtAlphaTransformer2DModel_forward(
+    self: PixArtTransformer2DModel,
+    hidden_states: torch.Tensor,
+    encoder_hidden_states: Optional[torch.Tensor] = None,
+    timestep: Optional[torch.LongTensor] = None,
+    added_cond_kwargs: Dict[str, torch.Tensor] = None,
+    class_labels: Optional[torch.LongTensor] = None,
+    cross_attention_kwargs: Dict[str, Any] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    encoder_attention_mask: Optional[torch.Tensor] = None,
+    return_dict: bool = True,
+):
+    assert hasattr(
+        self, "patched_parallel_size"
+    ), "please check your policy, `Transformer2DModel` Must have attribute `patched_parallel_size`"
+
+    if cross_attention_kwargs is not None:
+        if cross_attention_kwargs.get("scale", None) is not None:
+            logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
+    # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
+    #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
+    #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
+    # expects mask of shape:
+    #   [batch, key_tokens]
+    # adds singleton query_tokens dimension:
+    #   [batch,                    1, key_tokens]
+    # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+    #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+    #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+    if attention_mask is not None and attention_mask.ndim == 2:
+        # assume that mask is expressed as:
+        #   (1 = keep,      0 = discard)
+        # convert mask into a bias that can be added to attention scores:
+        #       (keep = +0,     discard = -10000.0)
+        attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
+        attention_mask = attention_mask.unsqueeze(1)
+
+    # convert encoder_attention_mask to a bias the same way we do for attention_mask
+    if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
+        encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
+        encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+    # 1. Input
+    batch_size = hidden_states.shape[0]
+    height, width = (
+        hidden_states.shape[-2] // self.config.patch_size,
+        hidden_states.shape[-1] // self.config.patch_size,
+    )
+    hidden_states = self.pos_embed(hidden_states)
+
+    timestep, embedded_timestep = self.adaln_single(
+        timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
+    )
+
+    if self.caption_projection is not None:
+        encoder_hidden_states = self.caption_projection(encoder_hidden_states)
+        encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
+
+    # 2. Blocks
+    for block in self.transformer_blocks:
+        hidden_states = block(
+            hidden_states,
+            attention_mask=attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            timestep=timestep,
+            cross_attention_kwargs=cross_attention_kwargs,
+            class_labels=class_labels,
+        )
+
+    # 3. Output
+    shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None].to(self.scale_shift_table.device)).chunk(
+        2, dim=1
+    )
+    hidden_states = self.norm_out(hidden_states)
+    # Modulation
+    hidden_states = hidden_states * (1 + scale.to(hidden_states.device)) + shift.to(hidden_states.device)
+    hidden_states = self.proj_out(hidden_states)
+    hidden_states = hidden_states.squeeze(1)
+
+    # unpatchify
+    hidden_states = hidden_states.reshape(
+        shape=(
+            -1,
+            height // self.patched_parallel_size,
+            width,
+            self.config.patch_size,
+            self.config.patch_size,
+            self.out_channels,
+        )
+    )
+    hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+    output = hidden_states.reshape(
+        shape=(
+            -1,
+            self.out_channels,
+            height // self.patched_parallel_size * self.config.patch_size,
+            width * self.config.patch_size,
+        )
+    )
+
+    # enable Distrifusion Optimization
+    if hasattr(self, "patched_parallel_size"):
+        from torch import distributed as dist
+
+        if (getattr(self, "output_buffer", None) is None) or (self.output_buffer.shape != output.shape):
+            self.output_buffer = torch.empty_like(output)
+        if (getattr(self, "buffer_list", None) is None) or (self.buffer_list[0].shape != output.shape):
+            self.buffer_list = [torch.empty_like(output) for _ in range(self.patched_parallel_size)]
+        output = output.contiguous()
+        dist.all_gather(self.buffer_list, output, async_op=False)
+        torch.cat(self.buffer_list, dim=2, out=self.output_buffer)
+        output = self.output_buffer
+
+    return (output,)
+
+
+# adapted from https://github.com/huggingface/diffusers/blob/v0.29.0-release/src/diffusers/models/transformers/transformer_sd3.py
+def SD3Transformer2DModel_forward(
+    self: SD3Transformer2DModel,
+    hidden_states: torch.FloatTensor,
+    encoder_hidden_states: torch.FloatTensor = None,
+    pooled_projections: torch.FloatTensor = None,
+    timestep: torch.LongTensor = None,
+    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+    return_dict: bool = True,
+) -> Union[torch.FloatTensor]:
+
+    assert hasattr(
+        self, "patched_parallel_size"
+    ), "please check your policy, `Transformer2DModel` Must have attribute `patched_parallel_size`"
+
+    height, width = hidden_states.shape[-2:]
+
+    hidden_states = self.pos_embed(hidden_states)  # takes care of adding positional embeddings too.
+    temb = self.time_text_embed(timestep, pooled_projections)
+    encoder_hidden_states = self.context_embedder(encoder_hidden_states)
+
+    for block in self.transformer_blocks:
+        encoder_hidden_states, hidden_states = block(
+            hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, temb=temb
+        )
+
+    hidden_states = self.norm_out(hidden_states, temb)
+    hidden_states = self.proj_out(hidden_states)
+
+    # unpatchify
+    patch_size = self.config.patch_size
+    height = height // patch_size // self.patched_parallel_size
+    width = width // patch_size
+
+    hidden_states = hidden_states.reshape(
+        shape=(hidden_states.shape[0], height, width, patch_size, patch_size, self.out_channels)
+    )
+    hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+    output = hidden_states.reshape(
+        shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size)
+    )
+
+    # enable Distrifusion Optimization
+    if hasattr(self, "patched_parallel_size"):
+        from torch import distributed as dist
+
+        if (getattr(self, "output_buffer", None) is None) or (self.output_buffer.shape != output.shape):
+            self.output_buffer = torch.empty_like(output)
+        if (getattr(self, "buffer_list", None) is None) or (self.buffer_list[0].shape != output.shape):
+            self.buffer_list = [torch.empty_like(output) for _ in range(self.patched_parallel_size)]
+        output = output.contiguous()
+        dist.all_gather(self.buffer_list, output, async_op=False)
+        torch.cat(self.buffer_list, dim=2, out=self.output_buffer)
+        output = self.output_buffer
+
+    return (output,)
+
+
+# Code adapted from: https://github.com/PipeFusion/PipeFusion/blob/main/pipefuser/modules/dit/patch_parallel/patchembed.py
+class DistrifusionPatchEmbed(ParallelModule):
+    def __init__(
+        self,
+        module: PatchEmbed,
+        process_group: Union[ProcessGroup, List[ProcessGroup]],
+        model_shard_infer_config: ModelShardInferenceConfig = None,
+    ):
+        super().__init__()
+        self.module = module
+        self.rank = dist.get_rank(group=process_group)
+        self.patched_parallelism_size = model_shard_infer_config.patched_parallelism_size
+
+    @staticmethod
+    def from_native_module(module: PatchEmbed, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs):
+        model_shard_infer_config = kwargs.get("model_shard_infer_config", None)
+        distrifusion_embed = DistrifusionPatchEmbed(
+            module, process_group, model_shard_infer_config=model_shard_infer_config
+        )
+        return distrifusion_embed
+
+    def forward(self, latent):
+        module = self.module
+        if module.pos_embed_max_size is not None:
+            height, width = latent.shape[-2:]
+        else:
+            height, width = latent.shape[-2] // module.patch_size, latent.shape[-1] // module.patch_size
+
+        latent = module.proj(latent)
+        if module.flatten:
+            latent = latent.flatten(2).transpose(1, 2)  # BCHW -> BNC
+        if module.layer_norm:
+            latent = module.norm(latent)
+        if module.pos_embed is None:
+            return latent.to(latent.dtype)
+        # Interpolate or crop positional embeddings as needed
+        if module.pos_embed_max_size:
+            pos_embed = module.cropped_pos_embed(height, width)
+        else:
+            if module.height != height or module.width != width:
+                pos_embed = get_2d_sincos_pos_embed(
+                    embed_dim=module.pos_embed.shape[-1],
+                    grid_size=(height, width),
+                    base_size=module.base_size,
+                    interpolation_scale=module.interpolation_scale,
+                )
+                pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).to(latent.device)
+            else:
+                pos_embed = module.pos_embed
+
+        b, c, h = pos_embed.shape
+        pos_embed = pos_embed.view(b, self.patched_parallelism_size, -1, h)[:, self.rank]
+
+        return (latent + pos_embed).to(latent.dtype)
+
+
+# Code adapted from: https://github.com/PipeFusion/PipeFusion/blob/main/pipefuser/modules/dit/patch_parallel/conv2d.py
+class DistrifusionConv2D(ParallelModule):
+
+    def __init__(
+        self,
+        module: nn.Conv2d,
+        process_group: Union[ProcessGroup, List[ProcessGroup]],
+        model_shard_infer_config: ModelShardInferenceConfig = None,
+    ):
+        super().__init__()
+        self.module = module
+        self.rank = dist.get_rank(group=process_group)
+        self.patched_parallelism_size = model_shard_infer_config.patched_parallelism_size
+
+    @staticmethod
+    def from_native_module(module: nn.Conv2d, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs):
+        model_shard_infer_config = kwargs.get("model_shard_infer_config", None)
+        distrifusion_conv = DistrifusionConv2D(module, process_group, model_shard_infer_config=model_shard_infer_config)
+        return distrifusion_conv
+
+    def sliced_forward(self, x: torch.Tensor) -> torch.Tensor:
+
+        b, c, h, w = x.shape
+
+        stride = self.module.stride[0]
+        padding = self.module.padding[0]
+
+        output_h = x.shape[2] // stride // self.patched_parallelism_size
+        idx = dist.get_rank()
+        h_begin = output_h * idx * stride - padding
+        h_end = output_h * (idx + 1) * stride + padding
+        final_padding = [padding, padding, 0, 0]
+        if h_begin < 0:
+            h_begin = 0
+            final_padding[2] = padding
+        if h_end > h:
+            h_end = h
+            final_padding[3] = padding
+        sliced_input = x[:, :, h_begin:h_end, :]
+        padded_input = F.pad(sliced_input, final_padding, mode="constant")
+        return F.conv2d(
+            padded_input,
+            self.module.weight,
+            self.module.bias,
+            stride=stride,
+            padding="valid",
+        )
+
+    def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        output = self.sliced_forward(input)
+        return output
+
+
+# Code adapted from: https://github.com/huggingface/diffusers/blob/v0.29.0-release/src/diffusers/models/attention_processor.py
+class DistrifusionFusedAttention(ParallelModule):
+
+    def __init__(
+        self,
+        module: attention_processor.Attention,
+        process_group: Union[ProcessGroup, List[ProcessGroup]],
+        model_shard_infer_config: ModelShardInferenceConfig = None,
+    ):
+        super().__init__()
+        self.counter = 0
+        self.module = module
+        self.buffer_list = None
+        self.kv_buffer_idx = dist.get_rank(group=process_group)
+        self.patched_parallelism_size = model_shard_infer_config.patched_parallelism_size
+        self.handle = None
+        self.process_group = process_group
+        self.warm_step = 5  # for warmup
+
+    @staticmethod
+    def from_native_module(
+        module: attention_processor.Attention, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+    ) -> ParallelModule:
+        model_shard_infer_config = kwargs.get("model_shard_infer_config", None)
+        return DistrifusionFusedAttention(
+            module=module,
+            process_group=process_group,
+            model_shard_infer_config=model_shard_infer_config,
+        )
+
+    def _forward(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+
+        input_ndim = hidden_states.ndim
+        if input_ndim == 4:
+            batch_size, channel, height, width = hidden_states.shape
+            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+        context_input_ndim = encoder_hidden_states.ndim
+        if context_input_ndim == 4:
+            batch_size, channel, height, width = encoder_hidden_states.shape
+            encoder_hidden_states = encoder_hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
+
+        batch_size = encoder_hidden_states.shape[0]
+
+        # `sample` projections.
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        kv = torch.cat([key, value], dim=-1)  # shape of kv now: (bs, seq_len // parallel_size, dim * 2)
+
+        if self.patched_parallelism_size == 1:
+            full_kv = kv
+        else:
+            if self.buffer_list is None:  # buffer not created
+                full_kv = torch.cat([kv for _ in range(self.patched_parallelism_size)], dim=1)
+            elif self.counter <= self.warm_step:
+                # logger.info(f"warmup: {self.counter}")
+                dist.all_gather(
+                    self.buffer_list,
+                    kv,
+                    group=self.process_group,
+                    async_op=False,
+                )
+                full_kv = torch.cat(self.buffer_list, dim=1)
+            else:
+                # logger.info(f"use old kv to infer: {self.counter}")
+                self.buffer_list[self.kv_buffer_idx].copy_(kv)
+                full_kv = torch.cat(self.buffer_list, dim=1)
+                assert self.handle is None, "we should maintain the kv of last step"
+                self.handle = dist.all_gather(self.buffer_list, kv, group=self.process_group, async_op=True)
+
+        key, value = torch.split(full_kv, full_kv.shape[-1] // 2, dim=-1)
+
+        # `context` projections.
+        encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+
+        # attention
+        query = torch.cat([query, encoder_hidden_states_query_proj], dim=1)
+        key = torch.cat([key, encoder_hidden_states_key_proj], dim=1)
+        value = torch.cat([value, encoder_hidden_states_value_proj], dim=1)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        hidden_states = hidden_states = F.scaled_dot_product_attention(
+            query, key, value, dropout_p=0.0, is_causal=False
+        )  # NOTE(@lry89757) for torch >= 2.2, flash attn has been already integrated into scaled_dot_product_attention, https://pytorch.org/blog/pytorch2-2/
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+        hidden_states = hidden_states.to(query.dtype)
+
+        # Split the attention outputs.
+        hidden_states, encoder_hidden_states = (
+            hidden_states[:, : residual.shape[1]],
+            hidden_states[:, residual.shape[1] :],
+        )
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+        if not attn.context_pre_only:
+            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
+
+        if input_ndim == 4:
+            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+        if context_input_ndim == 4:
+            encoder_hidden_states = encoder_hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
+
+        return hidden_states, encoder_hidden_states
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **cross_attention_kwargs,
+    ) -> torch.Tensor:
+
+        if self.handle is not None:
+            self.handle.wait()
+            self.handle = None
+
+        b, l, c = hidden_states.shape
+        kv_shape = (b, l, self.module.to_k.out_features * 2)
+        if self.patched_parallelism_size > 1 and (self.buffer_list is None or self.buffer_list[0].shape != kv_shape):
+
+            self.buffer_list = [
+                torch.empty(kv_shape, dtype=hidden_states.dtype, device=get_current_device())
+                for _ in range(self.patched_parallelism_size)
+            ]
+
+            self.counter = 0
+
+        attn_parameters = set(inspect.signature(self.module.processor.__call__).parameters.keys())
+        quiet_attn_parameters = {"ip_adapter_masks"}
+        unused_kwargs = [
+            k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters
+        ]
+        if len(unused_kwargs) > 0:
+            logger.warning(
+                f"cross_attention_kwargs {unused_kwargs} are not expected by {self.module.processor.__class__.__name__} and will be ignored."
+            )
+        cross_attention_kwargs = {k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters}
+
+        output = self._forward(
+            self.module,
+            hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
+
+        self.counter += 1
+
+        return output
+
+
+# Code adapted from: https://github.com/PipeFusion/PipeFusion/blob/main/pipefuser/modules/dit/patch_parallel/attn.py
+class DistriSelfAttention(ParallelModule):
+    def __init__(
+        self,
+        module: Attention,
+        process_group: Union[ProcessGroup, List[ProcessGroup]],
+        model_shard_infer_config: ModelShardInferenceConfig = None,
+    ):
+        super().__init__()
+        self.counter = 0
+        self.module = module
+        self.buffer_list = None
+        self.kv_buffer_idx = dist.get_rank(group=process_group)
+        self.patched_parallelism_size = model_shard_infer_config.patched_parallelism_size
+        self.handle = None
+        self.process_group = process_group
+        self.warm_step = 3  # for warmup
+
+    @staticmethod
+    def from_native_module(
+        module: Attention, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+    ) -> ParallelModule:
+        model_shard_infer_config = kwargs.get("model_shard_infer_config", None)
+        return DistriSelfAttention(
+            module=module,
+            process_group=process_group,
+            model_shard_infer_config=model_shard_infer_config,
+        )
+
+    def _forward(self, hidden_states: torch.FloatTensor, scale: float = 1.0):
+        attn = self.module
+        assert isinstance(attn, Attention)
+
+        residual = hidden_states
+
+        batch_size, sequence_length, _ = hidden_states.shape
+
+        query = attn.to_q(hidden_states)
+
+        encoder_hidden_states = hidden_states
+        k = self.module.to_k(encoder_hidden_states)
+        v = self.module.to_v(encoder_hidden_states)
+        kv = torch.cat([k, v], dim=-1)  # shape of kv now: (bs, seq_len // parallel_size, dim * 2)
+
+        if self.patched_parallelism_size == 1:
+            full_kv = kv
+        else:
+            if self.buffer_list is None:  # buffer not created
+                full_kv = torch.cat([kv for _ in range(self.patched_parallelism_size)], dim=1)
+            elif self.counter <= self.warm_step:
+                # logger.info(f"warmup: {self.counter}")
+                dist.all_gather(
+                    self.buffer_list,
+                    kv,
+                    group=self.process_group,
+                    async_op=False,
+                )
+                full_kv = torch.cat(self.buffer_list, dim=1)
+            else:
+                # logger.info(f"use old kv to infer: {self.counter}")
+                self.buffer_list[self.kv_buffer_idx].copy_(kv)
+                full_kv = torch.cat(self.buffer_list, dim=1)
+                assert self.handle is None, "we should maintain the kv of last step"
+                self.handle = dist.all_gather(self.buffer_list, kv, group=self.process_group, async_op=True)
+
+        if HAS_FLASH_ATTN:
+            # flash attn
+            key, value = torch.split(full_kv, full_kv.shape[-1] // 2, dim=-1)
+            inner_dim = key.shape[-1]
+            head_dim = inner_dim // attn.heads
+
+            query = query.view(batch_size, -1, attn.heads, head_dim)
+            key = key.view(batch_size, -1, attn.heads, head_dim)
+            value = value.view(batch_size, -1, attn.heads, head_dim)
+
+            hidden_states = flash_attn_func(query, key, value, dropout_p=0.0, causal=False)
+            hidden_states = hidden_states.reshape(batch_size, -1, attn.heads * head_dim).to(query.dtype)
+        else:
+            # naive attn
+            key, value = torch.split(full_kv, full_kv.shape[-1] // 2, dim=-1)
+
+            inner_dim = key.shape[-1]
+            head_dim = inner_dim // attn.heads
+
+            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+            value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+            # the output of sdp = (batch, num_heads, seq_len, head_dim)
+            # TODO: add support for attn.scale when we move to Torch 2.1
+            hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
+
+            hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+            hidden_states = hidden_states.to(query.dtype)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        if attn.residual_connection:
+            hidden_states = hidden_states + residual
+
+        hidden_states = hidden_states / attn.rescale_output_factor
+
+        return hidden_states
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        scale: float = 1.0,
+        *args,
+        **kwargs,
+    ) -> torch.FloatTensor:
+
+        # async preallocates memo buffer
+        if self.handle is not None:
+            self.handle.wait()
+            self.handle = None
+
+        b, l, c = hidden_states.shape
+        kv_shape = (b, l, self.module.to_k.out_features * 2)
+        if self.patched_parallelism_size > 1 and (self.buffer_list is None or self.buffer_list[0].shape != kv_shape):
+
+            self.buffer_list = [
+                torch.empty(kv_shape, dtype=hidden_states.dtype, device=get_current_device())
+                for _ in range(self.patched_parallelism_size)
+            ]
+
+            self.counter = 0
+
+        output = self._forward(hidden_states, scale=scale)
+
+        self.counter += 1
+        return output
diff --git a/colossalai/inference/modeling/models/pixart_alpha.py b/colossalai/inference/modeling/models/pixart_alpha.py
new file mode 100644
index 000000000000..cc2bee5efd4d
--- /dev/null
+++ b/colossalai/inference/modeling/models/pixart_alpha.py
@@ -0,0 +1,220 @@
+# Code adapted from:
+# https://github.com/huggingface/diffusers/blob/v0.29.0-release/src/diffusers/pipelines/pixart_alpha/pipeline_pixart_alpha.py
+
+from typing import Callable, List, Optional, Union
+
+import PIL.Image
+import torch
+from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha import (
+    ASPECT_RATIO_256_BIN,
+    ASPECT_RATIO_512_BIN,
+    ASPECT_RATIO_1024_BIN,
+)
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps
+
+from colossalai.logging import get_dist_logger
+
+from ..layers.diffusion import DiffusionPipe
+
+logger = get_dist_logger(__name__)
+
+
+@torch.no_grad()
+def pixart_alpha_forward(
+    self: DiffusionPipe,
+    prompt: Union[str, List[str]] = None,
+    negative_prompt: str = "",
+    num_inference_steps: int = 20,
+    timesteps: List[int] = None,
+    sigmas: List[float] = None,
+    guidance_scale: float = 4.5,
+    num_images_per_prompt: Optional[int] = 1,
+    height: Optional[int] = None,
+    width: Optional[int] = None,
+    eta: float = 0.0,
+    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    latents: Optional[torch.Tensor] = None,
+    prompt_embeds: Optional[torch.Tensor] = None,
+    prompt_attention_mask: Optional[torch.Tensor] = None,
+    negative_prompt_embeds: Optional[torch.Tensor] = None,
+    negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+    output_type: Optional[str] = "pil",
+    return_dict: bool = True,
+    callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
+    callback_steps: int = 1,
+    clean_caption: bool = True,
+    use_resolution_binning: bool = True,
+    max_sequence_length: int = 120,
+    **kwargs,
+) -> PIL.Image:
+    # 1. Check inputs. Raise error if not correct
+    height = height or self.transformer.config.sample_size * self.vae_scale_factor
+    width = width or self.transformer.config.sample_size * self.vae_scale_factor
+    if use_resolution_binning:
+        if self.transformer.config.sample_size == 128:
+            aspect_ratio_bin = ASPECT_RATIO_1024_BIN
+        elif self.transformer.config.sample_size == 64:
+            aspect_ratio_bin = ASPECT_RATIO_512_BIN
+        elif self.transformer.config.sample_size == 32:
+            aspect_ratio_bin = ASPECT_RATIO_256_BIN
+        else:
+            raise ValueError("Invalid sample size")
+        orig_height, orig_width = height, width
+        height, width = self.image_processor.classify_height_width_bin(height, width, ratios=aspect_ratio_bin)
+
+    self.check_inputs(
+        prompt,
+        height,
+        width,
+        negative_prompt,
+        callback_steps,
+        prompt_embeds,
+        negative_prompt_embeds,
+        prompt_attention_mask,
+        negative_prompt_attention_mask,
+    )
+
+    # 2. Default height and width to transformer
+    if prompt is not None and isinstance(prompt, str):
+        batch_size = 1
+    elif prompt is not None and isinstance(prompt, list):
+        batch_size = len(prompt)
+    else:
+        batch_size = prompt_embeds.shape[0]
+
+    device = self._execution_device
+
+    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+    # corresponds to doing no classifier free guidance.
+    do_classifier_free_guidance = guidance_scale > 1.0
+
+    # 3. Encode input prompt
+    (
+        prompt_embeds,
+        prompt_attention_mask,
+        negative_prompt_embeds,
+        negative_prompt_attention_mask,
+    ) = self.encode_prompt(
+        prompt,
+        do_classifier_free_guidance,
+        negative_prompt=negative_prompt,
+        num_images_per_prompt=num_images_per_prompt,
+        device=device,
+        prompt_embeds=prompt_embeds,
+        negative_prompt_embeds=negative_prompt_embeds,
+        prompt_attention_mask=prompt_attention_mask,
+        negative_prompt_attention_mask=negative_prompt_attention_mask,
+        clean_caption=clean_caption,
+        max_sequence_length=max_sequence_length,
+    )
+    if do_classifier_free_guidance:
+        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+        prompt_attention_mask = torch.cat([negative_prompt_attention_mask, prompt_attention_mask], dim=0)
+
+    # 4. Prepare timesteps
+    timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps, sigmas)
+
+    # 5. Prepare latents.
+    latent_channels = self.transformer.config.in_channels
+    latents = self.prepare_latents(
+        batch_size * num_images_per_prompt,
+        latent_channels,
+        height,
+        width,
+        prompt_embeds.dtype,
+        device,
+        generator,
+        latents,
+    )
+
+    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
+    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+
+    # 6.1 Prepare micro-conditions.
+    added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
+    if self.transformer.config.sample_size == 128:
+        resolution = torch.tensor([height, width]).repeat(batch_size * num_images_per_prompt, 1)
+        aspect_ratio = torch.tensor([float(height / width)]).repeat(batch_size * num_images_per_prompt, 1)
+        resolution = resolution.to(dtype=prompt_embeds.dtype, device=device)
+        aspect_ratio = aspect_ratio.to(dtype=prompt_embeds.dtype, device=device)
+
+        if do_classifier_free_guidance:
+            resolution = torch.cat([resolution, resolution], dim=0)
+            aspect_ratio = torch.cat([aspect_ratio, aspect_ratio], dim=0)
+
+        added_cond_kwargs = {"resolution": resolution, "aspect_ratio": aspect_ratio}
+
+    # 7. Denoising loop
+    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+
+    with self.progress_bar(total=num_inference_steps) as progress_bar:
+        for i, t in enumerate(timesteps):
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            current_timestep = t
+            if not torch.is_tensor(current_timestep):
+                # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+                # This would be a good case for the `match` statement (Python 3.10+)
+                is_mps = latent_model_input.device.type == "mps"
+                if isinstance(current_timestep, float):
+                    dtype = torch.float32 if is_mps else torch.float64
+                else:
+                    dtype = torch.int32 if is_mps else torch.int64
+                current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
+            elif len(current_timestep.shape) == 0:
+                current_timestep = current_timestep[None].to(latent_model_input.device)
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            current_timestep = current_timestep.expand(latent_model_input.shape[0])
+
+            # predict noise model_output
+            noise_pred = self.transformer(
+                latent_model_input,
+                encoder_hidden_states=prompt_embeds,
+                encoder_attention_mask=prompt_attention_mask,
+                timestep=current_timestep,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # learned sigma
+            if self.transformer.config.out_channels // 2 == latent_channels:
+                noise_pred = noise_pred.chunk(2, dim=1)[0]
+            else:
+                noise_pred = noise_pred
+
+            # compute previous image: x_t -> x_t-1
+            if num_inference_steps == 1:
+                # For DMD one step sampling: https://arxiv.org/abs/2311.18828
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).pred_original_sample
+            else:
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+
+            # call the callback, if provided
+            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                progress_bar.update()
+                if callback is not None and i % callback_steps == 0:
+                    step_idx = i // getattr(self.scheduler, "order", 1)
+                    callback(step_idx, t, latents)
+
+    output_type = "pil"  # TODO(@lry89757) temporarily image, please support more return output
+    if not output_type == "latent":
+        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+        if use_resolution_binning:
+            image = self.image_processor.resize_and_crop_tensor(image, orig_width, orig_height)
+    else:
+        image = latents
+
+    if not output_type == "latent":
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+    # Offload all models
+    # self.maybe_free_model_hooks()
+
+    return image
diff --git a/colossalai/inference/modeling/models/stablediffusion3.py b/colossalai/inference/modeling/models/stablediffusion3.py
new file mode 100644
index 000000000000..b123164039c8
--- /dev/null
+++ b/colossalai/inference/modeling/models/stablediffusion3.py
@@ -0,0 +1,178 @@
+# This code is adapted from huggingface diffusers: https://github.com/huggingface/diffusers/blob/v0.29.0-release/src/diffusers/pipelines/stable_diffusion_3/pipeline_stable_diffusion_3.py
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import torch
+from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import retrieve_timesteps
+
+from ..layers.diffusion import DiffusionPipe
+
+
+# TODO(@lry89757) temporarily image, please support more return output
+@torch.no_grad()
+def sd3_forward(
+    self: DiffusionPipe,
+    prompt: Union[str, List[str]] = None,
+    prompt_2: Optional[Union[str, List[str]]] = None,
+    prompt_3: Optional[Union[str, List[str]]] = None,
+    height: Optional[int] = None,
+    width: Optional[int] = None,
+    num_inference_steps: int = 28,
+    timesteps: List[int] = None,
+    guidance_scale: float = 7.0,
+    negative_prompt: Optional[Union[str, List[str]]] = None,
+    negative_prompt_2: Optional[Union[str, List[str]]] = None,
+    negative_prompt_3: Optional[Union[str, List[str]]] = None,
+    num_images_per_prompt: Optional[int] = 1,
+    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+    latents: Optional[torch.FloatTensor] = None,
+    prompt_embeds: Optional[torch.FloatTensor] = None,
+    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
+    output_type: Optional[str] = "pil",
+    return_dict: bool = True,
+    joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+    clip_skip: Optional[int] = None,
+    callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+    callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+):
+    height = height or self.default_sample_size * self.vae_scale_factor
+    width = width or self.default_sample_size * self.vae_scale_factor
+
+    # 1. Check inputs. Raise error if not correct
+    self.check_inputs(
+        prompt,
+        prompt_2,
+        prompt_3,
+        height,
+        width,
+        negative_prompt=negative_prompt,
+        negative_prompt_2=negative_prompt_2,
+        negative_prompt_3=negative_prompt_3,
+        prompt_embeds=prompt_embeds,
+        negative_prompt_embeds=negative_prompt_embeds,
+        pooled_prompt_embeds=pooled_prompt_embeds,
+        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+    )
+
+    self._guidance_scale = guidance_scale
+    self._clip_skip = clip_skip
+    self._joint_attention_kwargs = joint_attention_kwargs
+    self._interrupt = False
+
+    # 2. Define call parameters
+    if prompt is not None and isinstance(prompt, str):
+        batch_size = 1
+    elif prompt is not None and isinstance(prompt, list):
+        batch_size = len(prompt)
+    else:
+        batch_size = prompt_embeds.shape[0]
+
+    device = self._execution_device
+
+    (
+        prompt_embeds,
+        negative_prompt_embeds,
+        pooled_prompt_embeds,
+        negative_pooled_prompt_embeds,
+    ) = self.encode_prompt(
+        prompt=prompt,
+        prompt_2=prompt_2,
+        prompt_3=prompt_3,
+        negative_prompt=negative_prompt,
+        negative_prompt_2=negative_prompt_2,
+        negative_prompt_3=negative_prompt_3,
+        do_classifier_free_guidance=self.do_classifier_free_guidance,
+        prompt_embeds=prompt_embeds,
+        negative_prompt_embeds=negative_prompt_embeds,
+        pooled_prompt_embeds=pooled_prompt_embeds,
+        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+        device=device,
+        clip_skip=self.clip_skip,
+        num_images_per_prompt=num_images_per_prompt,
+    )
+
+    if self.do_classifier_free_guidance:
+        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+        pooled_prompt_embeds = torch.cat([negative_pooled_prompt_embeds, pooled_prompt_embeds], dim=0)
+
+    # 4. Prepare timesteps
+    timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
+    self._num_timesteps = len(timesteps)
+
+    # 5. Prepare latent variables
+    num_channels_latents = self.transformer.config.in_channels
+    latents = self.prepare_latents(
+        batch_size * num_images_per_prompt,
+        num_channels_latents,
+        height,
+        width,
+        prompt_embeds.dtype,
+        device,
+        generator,
+        latents,
+    )
+
+    # 6. Denoising loop
+    with self.progress_bar(total=num_inference_steps) as progress_bar:
+        for i, t in enumerate(timesteps):
+            if self.interrupt:
+                continue
+
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+            # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+            timestep = t.expand(latent_model_input.shape[0])
+
+            noise_pred = self.transformer(
+                hidden_states=latent_model_input,
+                timestep=timestep,
+                encoder_hidden_states=prompt_embeds,
+                pooled_projections=pooled_prompt_embeds,
+                joint_attention_kwargs=self.joint_attention_kwargs,
+                return_dict=False,
+            )[0]
+
+            # perform guidance
+            if self.do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+            # compute the previous noisy sample x_t -> x_t-1
+            latents_dtype = latents.dtype
+            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+            if latents.dtype != latents_dtype:
+                if torch.backends.mps.is_available():
+                    # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
+                    latents = latents.to(latents_dtype)
+
+            if callback_on_step_end is not None:
+                callback_kwargs = {}
+                for k in callback_on_step_end_tensor_inputs:
+                    callback_kwargs[k] = locals()[k]
+                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                latents = callback_outputs.pop("latents", latents)
+                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                negative_pooled_prompt_embeds = callback_outputs.pop(
+                    "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                )
+
+            # call the callback, if provided
+            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                progress_bar.update()
+
+    if output_type == "latent":
+        image = latents
+
+    else:
+        latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
+
+        image = self.vae.decode(latents, return_dict=False)[0]
+        image = self.image_processor.postprocess(image, output_type=output_type)
+
+    return image
diff --git a/colossalai/inference/modeling/policy/__init__.py b/colossalai/inference/modeling/policy/__init__.py
index fa03955907fe..02ffadd9f6b0 100644
--- a/colossalai/inference/modeling/policy/__init__.py
+++ b/colossalai/inference/modeling/policy/__init__.py
@@ -1,16 +1,22 @@
 from .glide_llama import GlideLlamaModelPolicy
 from .nopadding_baichuan import NoPaddingBaichuanModelInferPolicy
 from .nopadding_llama import NoPaddingLlamaModelInferPolicy
+from .pixart_alpha import PixArtAlphaInferPolicy
+from .stablediffusion3 import StableDiffusion3InferPolicy
 
 model_policy_map = {
     "nopadding_llama": NoPaddingLlamaModelInferPolicy,
     "nopadding_baichuan": NoPaddingBaichuanModelInferPolicy,
     "glide_llama": GlideLlamaModelPolicy,
+    "StableDiffusion3Pipeline": StableDiffusion3InferPolicy,
+    "PixArtAlphaPipeline": PixArtAlphaInferPolicy,
 }
 
 __all__ = [
     "NoPaddingLlamaModelInferPolicy",
     "NoPaddingBaichuanModelInferPolicy",
     "GlideLlamaModelPolicy",
+    "StableDiffusion3InferPolicy",
+    "PixArtAlphaInferPolicy",
     "model_polic_map",
 ]
diff --git a/colossalai/inference/modeling/policy/pixart_alpha.py b/colossalai/inference/modeling/policy/pixart_alpha.py
new file mode 100644
index 000000000000..1150b2432cc5
--- /dev/null
+++ b/colossalai/inference/modeling/policy/pixart_alpha.py
@@ -0,0 +1,79 @@
+from diffusers.models.attention import BasicTransformerBlock
+from diffusers.models.transformers.pixart_transformer_2d import PixArtTransformer2DModel
+from torch import nn
+
+from colossalai.inference.config import RPC_PARAM
+from colossalai.inference.modeling.layers.diffusion import DiffusionPipe
+from colossalai.inference.modeling.layers.distrifusion import (
+    DistrifusionConv2D,
+    DistrifusionPatchEmbed,
+    DistriSelfAttention,
+    PixArtAlphaTransformer2DModel_forward,
+)
+from colossalai.inference.modeling.models.pixart_alpha import pixart_alpha_forward
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+
+class PixArtAlphaInferPolicy(Policy, RPC_PARAM):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def module_policy(self):
+        policy = {}
+
+        if self.shard_config.extra_kwargs["model_shard_infer_config"].patched_parallelism_size > 1:
+
+            policy[PixArtTransformer2DModel] = ModulePolicyDescription(
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="pos_embed.proj",
+                        target_module=DistrifusionConv2D,
+                        kwargs={"model_shard_infer_config": self.shard_config.extra_kwargs["model_shard_infer_config"]},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="pos_embed",
+                        target_module=DistrifusionPatchEmbed,
+                        kwargs={"model_shard_infer_config": self.shard_config.extra_kwargs["model_shard_infer_config"]},
+                    ),
+                ],
+                attribute_replacement={
+                    "patched_parallel_size": self.shard_config.extra_kwargs[
+                        "model_shard_infer_config"
+                    ].patched_parallelism_size
+                },
+                method_replacement={"forward": PixArtAlphaTransformer2DModel_forward},
+            )
+
+            policy[BasicTransformerBlock] = ModulePolicyDescription(
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="attn1",
+                        target_module=DistriSelfAttention,
+                        kwargs={
+                            "model_shard_infer_config": self.shard_config.extra_kwargs["model_shard_infer_config"],
+                        },
+                    )
+                ]
+            )
+
+        self.append_or_create_method_replacement(
+            description={"forward": pixart_alpha_forward}, policy=policy, target_key=DiffusionPipe
+        )
+
+        return policy
+
+    def preprocess(self) -> nn.Module:
+        return self.model
+
+    def postprocess(self):
+        return self.model
+
+    def config_sanity_check(self):
+        pass
+
+    def to_rpc_param(self) -> str:
+        return __class__.__name__
+
+    @staticmethod
+    def from_rpc_param() -> "PixArtAlphaInferPolicy":
+        return PixArtAlphaInferPolicy()
diff --git a/colossalai/inference/modeling/policy/stablediffusion3.py b/colossalai/inference/modeling/policy/stablediffusion3.py
new file mode 100644
index 000000000000..39b764b92887
--- /dev/null
+++ b/colossalai/inference/modeling/policy/stablediffusion3.py
@@ -0,0 +1,78 @@
+from diffusers.models.attention import JointTransformerBlock
+from diffusers.models.transformers import SD3Transformer2DModel
+from torch import nn
+
+from colossalai.inference.config import RPC_PARAM
+from colossalai.inference.modeling.layers.diffusion import DiffusionPipe
+from colossalai.inference.modeling.layers.distrifusion import (
+    DistrifusionConv2D,
+    DistrifusionFusedAttention,
+    DistrifusionPatchEmbed,
+    SD3Transformer2DModel_forward,
+)
+from colossalai.inference.modeling.models.stablediffusion3 import sd3_forward
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+
+class StableDiffusion3InferPolicy(Policy, RPC_PARAM):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def module_policy(self):
+        policy = {}
+
+        if self.shard_config.extra_kwargs["model_shard_infer_config"].patched_parallelism_size > 1:
+
+            policy[SD3Transformer2DModel] = ModulePolicyDescription(
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="pos_embed.proj",
+                        target_module=DistrifusionConv2D,
+                        kwargs={"model_shard_infer_config": self.shard_config.extra_kwargs["model_shard_infer_config"]},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="pos_embed",
+                        target_module=DistrifusionPatchEmbed,
+                        kwargs={"model_shard_infer_config": self.shard_config.extra_kwargs["model_shard_infer_config"]},
+                    ),
+                ],
+                attribute_replacement={
+                    "patched_parallel_size": self.shard_config.extra_kwargs[
+                        "model_shard_infer_config"
+                    ].patched_parallelism_size
+                },
+                method_replacement={"forward": SD3Transformer2DModel_forward},
+            )
+
+        policy[JointTransformerBlock] = ModulePolicyDescription(
+            sub_module_replacement=[
+                SubModuleReplacementDescription(
+                    suffix="attn",
+                    target_module=DistrifusionFusedAttention,
+                    kwargs={
+                        "model_shard_infer_config": self.shard_config.extra_kwargs["model_shard_infer_config"],
+                    },
+                )
+            ]
+        )
+
+        self.append_or_create_method_replacement(
+            description={"forward": sd3_forward}, policy=policy, target_key=DiffusionPipe
+        )
+        return policy
+
+    def preprocess(self) -> nn.Module:
+        return self.model
+
+    def postprocess(self):
+        return self.model
+
+    def config_sanity_check(self):
+        pass
+
+    def to_rpc_param(self) -> str:
+        return __class__.__name__
+
+    @staticmethod
+    def from_rpc_param() -> "StableDiffusion3InferPolicy":
+        return StableDiffusion3InferPolicy()
diff --git a/colossalai/inference/struct.py b/colossalai/inference/struct.py
index 1a3094a27e2d..65d284296bcb 100644
--- a/colossalai/inference/struct.py
+++ b/colossalai/inference/struct.py
@@ -2,6 +2,7 @@
 from dataclasses import dataclass
 from typing import Any, List
 
+from colossalai.inference.config import DiffusionGenerationConfig
 from colossalai.logging import get_dist_logger
 
 logger = get_dist_logger(__name__)
@@ -46,6 +47,17 @@ def is_waiting(status: "RequestStatus") -> bool:
         return status == RequestStatus.WAITING
 
 
+@dataclass
+class DiffusionSequence:
+    """
+    parameters for diffusion
+    """
+
+    request_id: int
+    prompt: str
+    generation_config: DiffusionGenerationConfig
+
+
 @dataclass
 class Sequence:
     """Store information of input sequence.
diff --git a/colossalai/inference/utils.py b/colossalai/inference/utils.py
index 332e84d374b0..d0851e362318 100644
--- a/colossalai/inference/utils.py
+++ b/colossalai/inference/utils.py
@@ -5,10 +5,12 @@
 import math
 import os
 import re
+from enum import Enum
 from pathlib import Path
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
 
 import torch
+from diffusers import DiffusionPipeline
 from torch import nn
 
 from colossalai.logging import get_dist_logger
@@ -159,3 +161,36 @@ def can_use_flash_attn2(dtype: torch.dtype) -> bool:
     except ImportError:
         logger.warning(f"flash_attn2 has not been installed yet, we will use triton flash attn instead.")
         return False
+
+
+class ModelType(Enum):
+    DIFFUSION_MODEL = "Diffusion Model"
+    LLM = "Large Language Model (LLM)"
+    UNKNOWN = "Unknown Model Type"
+
+
+def get_model_type(model_or_path: Union[nn.Module, str, DiffusionPipeline]):
+    if isinstance(model_or_path, DiffusionPipeline):
+        return ModelType.DIFFUSION_MODEL
+    elif isinstance(model_or_path, nn.Module):
+        return ModelType.LLM
+    elif isinstance(model_or_path, str):
+        try:
+            from transformers import AutoConfig
+
+            hf_config = AutoConfig.from_pretrained(model_or_path, trust_remote_code=True)
+            return ModelType.LLM
+        except:
+            """
+            model type is not `ModelType.LLM`
+            """
+
+        try:
+            DiffusionPipeline.load_config(model_or_path)
+            return ModelType.DIFFUSION_MODEL
+        except:
+            """
+            model type is not `ModelType.DIFFUSION_MODEL`
+            """
+    else:
+        return ModelType.UNKNOWN
diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index 71d42312ee7d..4e2eff7ce352 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -3,6 +3,12 @@
 
 import os
 
+# set CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that when overlapping communication and computation,
+# the order of of kernel launches on GPUs are the same as on the CPU so that comm is launched first.
+# see https://github.com/NVIDIA/Megatron-LM/issues/533
+# https://forums.developer.nvidia.com/t/how-many-streams-maximum-number-of-streams/6571/16
+os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
+
 import torch.distributed as dist
 
 from colossalai.accelerator import get_accelerator
diff --git a/colossalai/shardformer/layer/moe/__init__.py b/colossalai/legacy/moe/layer/__init__.py
similarity index 100%
rename from colossalai/shardformer/layer/moe/__init__.py
rename to colossalai/legacy/moe/layer/__init__.py
diff --git a/colossalai/shardformer/layer/moe/experts.py b/colossalai/legacy/moe/layer/experts.py
similarity index 95%
rename from colossalai/shardformer/layer/moe/experts.py
rename to colossalai/legacy/moe/layer/experts.py
index 1be7a27547ed..8088cf44e473 100644
--- a/colossalai/shardformer/layer/moe/experts.py
+++ b/colossalai/legacy/moe/layer/experts.py
@@ -5,9 +5,9 @@
 import torch.nn as nn
 
 from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
-from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import get_activation
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import get_activation
+from colossalai.moe._operation import EPGradScalerIn, EPGradScalerOut
 from colossalai.shardformer.layer.utils import Randomizer
 from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size
 
@@ -118,7 +118,7 @@ def forward(
         Returns:
             torch.Tensor: The output tensor of shape (num_groups, num_experts, capacity, hidden_size)
         """
-        x = MoeInGradScaler.apply(x, self.ep_size)
+        x = EPGradScalerIn.apply(x, self.ep_size)
 
         e = x.size(1)
         h = x.size(-1)
@@ -157,5 +157,5 @@ def forward(
         x = torch.cat([x[i].unsqueeze(0) for i in range(e)], dim=0)
         x = x.reshape(inshape)
         x = x.transpose(0, 1).contiguous()
-        x = MoeOutGradScaler.apply(x, self.ep_size)
+        x = EPGradScalerOut.apply(x, self.ep_size)
         return x
diff --git a/colossalai/shardformer/layer/moe/layers.py b/colossalai/legacy/moe/layer/layers.py
similarity index 99%
rename from colossalai/shardformer/layer/moe/layers.py
rename to colossalai/legacy/moe/layer/layers.py
index e5b0ef97fd87..e43966f68a8c 100644
--- a/colossalai/shardformer/layer/moe/layers.py
+++ b/colossalai/legacy/moe/layer/layers.py
@@ -7,9 +7,9 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
+from colossalai.legacy.moe.load_balance import LoadBalancer
+from colossalai.legacy.moe.utils import create_ep_hierarchical_group, get_noise_generator
 from colossalai.moe._operation import AllGather, AllToAll, HierarchicalAllToAll, MoeCombine, MoeDispatch, ReduceScatter
-from colossalai.moe.load_balance import LoadBalancer
-from colossalai.moe.utils import create_ep_hierarchical_group, get_noise_generator
 from colossalai.shardformer.layer.moe import MLPExperts
 from colossalai.tensor.moe_tensor.api import get_dp_group, get_ep_group, get_ep_group_ranks, get_ep_size
 
diff --git a/colossalai/shardformer/layer/moe/routers.py b/colossalai/legacy/moe/layer/routers.py
similarity index 95%
rename from colossalai/shardformer/layer/moe/routers.py
rename to colossalai/legacy/moe/layer/routers.py
index 1be7a27547ed..8088cf44e473 100644
--- a/colossalai/shardformer/layer/moe/routers.py
+++ b/colossalai/legacy/moe/layer/routers.py
@@ -5,9 +5,9 @@
 import torch.nn as nn
 
 from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
-from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import get_activation
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import get_activation
+from colossalai.moe._operation import EPGradScalerIn, EPGradScalerOut
 from colossalai.shardformer.layer.utils import Randomizer
 from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size
 
@@ -118,7 +118,7 @@ def forward(
         Returns:
             torch.Tensor: The output tensor of shape (num_groups, num_experts, capacity, hidden_size)
         """
-        x = MoeInGradScaler.apply(x, self.ep_size)
+        x = EPGradScalerIn.apply(x, self.ep_size)
 
         e = x.size(1)
         h = x.size(-1)
@@ -157,5 +157,5 @@ def forward(
         x = torch.cat([x[i].unsqueeze(0) for i in range(e)], dim=0)
         x = x.reshape(inshape)
         x = x.transpose(0, 1).contiguous()
-        x = MoeOutGradScaler.apply(x, self.ep_size)
+        x = EPGradScalerOut.apply(x, self.ep_size)
         return x
diff --git a/colossalai/moe/load_balance.py b/colossalai/legacy/moe/load_balance.py
similarity index 99%
rename from colossalai/moe/load_balance.py
rename to colossalai/legacy/moe/load_balance.py
index 3dc6c02c7445..7339b1a7b0eb 100644
--- a/colossalai/moe/load_balance.py
+++ b/colossalai/legacy/moe/load_balance.py
@@ -7,7 +7,7 @@
 from torch.distributed import ProcessGroup
 
 from colossalai.cluster import ProcessGroupMesh
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
 from colossalai.shardformer.layer.moe import MLPExperts
 from colossalai.zero.low_level import LowLevelZeroOptimizer
 
diff --git a/colossalai/moe/manager.py b/colossalai/legacy/moe/manager.py
similarity index 100%
rename from colossalai/moe/manager.py
rename to colossalai/legacy/moe/manager.py
diff --git a/examples/language/openmoe/README.md b/colossalai/legacy/moe/openmoe/README.md
similarity index 100%
rename from examples/language/openmoe/README.md
rename to colossalai/legacy/moe/openmoe/README.md
diff --git a/examples/language/openmoe/benchmark/benchmark_cai.py b/colossalai/legacy/moe/openmoe/benchmark/benchmark_cai.py
similarity index 99%
rename from examples/language/openmoe/benchmark/benchmark_cai.py
rename to colossalai/legacy/moe/openmoe/benchmark/benchmark_cai.py
index b9ef915c32a4..5f9447246ae4 100644
--- a/examples/language/openmoe/benchmark/benchmark_cai.py
+++ b/colossalai/legacy/moe/openmoe/benchmark/benchmark_cai.py
@@ -18,9 +18,9 @@
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import skip_init
 from colossalai.moe.layers import apply_load_balance
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import skip_init
 from colossalai.nn.optimizer import HybridAdam
 
 
diff --git a/examples/language/openmoe/benchmark/benchmark_cai.sh b/colossalai/legacy/moe/openmoe/benchmark/benchmark_cai.sh
similarity index 100%
rename from examples/language/openmoe/benchmark/benchmark_cai.sh
rename to colossalai/legacy/moe/openmoe/benchmark/benchmark_cai.sh
diff --git a/examples/language/openmoe/benchmark/benchmark_cai_dist.sh b/colossalai/legacy/moe/openmoe/benchmark/benchmark_cai_dist.sh
similarity index 100%
rename from examples/language/openmoe/benchmark/benchmark_cai_dist.sh
rename to colossalai/legacy/moe/openmoe/benchmark/benchmark_cai_dist.sh
diff --git a/examples/language/openmoe/benchmark/benchmark_fsdp.py b/colossalai/legacy/moe/openmoe/benchmark/benchmark_fsdp.py
similarity index 98%
rename from examples/language/openmoe/benchmark/benchmark_fsdp.py
rename to colossalai/legacy/moe/openmoe/benchmark/benchmark_fsdp.py
index b00fbd001022..1ae94dd90977 100644
--- a/examples/language/openmoe/benchmark/benchmark_fsdp.py
+++ b/colossalai/legacy/moe/openmoe/benchmark/benchmark_fsdp.py
@@ -14,7 +14,7 @@
 from transformers.models.llama import LlamaConfig
 from utils import PerformanceEvaluator, get_model_numel
 
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
 
 
 class RandomDataset(Dataset):
diff --git a/examples/language/openmoe/benchmark/benchmark_fsdp.sh b/colossalai/legacy/moe/openmoe/benchmark/benchmark_fsdp.sh
similarity index 100%
rename from examples/language/openmoe/benchmark/benchmark_fsdp.sh
rename to colossalai/legacy/moe/openmoe/benchmark/benchmark_fsdp.sh
diff --git a/examples/language/openmoe/benchmark/hostfile.txt b/colossalai/legacy/moe/openmoe/benchmark/hostfile.txt
similarity index 100%
rename from examples/language/openmoe/benchmark/hostfile.txt
rename to colossalai/legacy/moe/openmoe/benchmark/hostfile.txt
diff --git a/examples/language/openmoe/benchmark/utils.py b/colossalai/legacy/moe/openmoe/benchmark/utils.py
similarity index 100%
rename from examples/language/openmoe/benchmark/utils.py
rename to colossalai/legacy/moe/openmoe/benchmark/utils.py
diff --git a/examples/language/openmoe/infer.py b/colossalai/legacy/moe/openmoe/infer.py
similarity index 100%
rename from examples/language/openmoe/infer.py
rename to colossalai/legacy/moe/openmoe/infer.py
diff --git a/examples/language/openmoe/infer.sh b/colossalai/legacy/moe/openmoe/infer.sh
similarity index 100%
rename from examples/language/openmoe/infer.sh
rename to colossalai/legacy/moe/openmoe/infer.sh
diff --git a/examples/language/openmoe/model/__init__.py b/colossalai/legacy/moe/openmoe/model/__init__.py
similarity index 100%
rename from examples/language/openmoe/model/__init__.py
rename to colossalai/legacy/moe/openmoe/model/__init__.py
diff --git a/examples/language/openmoe/model/convert_openmoe_ckpt.py b/colossalai/legacy/moe/openmoe/model/convert_openmoe_ckpt.py
similarity index 100%
rename from examples/language/openmoe/model/convert_openmoe_ckpt.py
rename to colossalai/legacy/moe/openmoe/model/convert_openmoe_ckpt.py
diff --git a/examples/language/openmoe/model/convert_openmoe_ckpt.sh b/colossalai/legacy/moe/openmoe/model/convert_openmoe_ckpt.sh
similarity index 100%
rename from examples/language/openmoe/model/convert_openmoe_ckpt.sh
rename to colossalai/legacy/moe/openmoe/model/convert_openmoe_ckpt.sh
diff --git a/examples/language/openmoe/model/modeling_openmoe.py b/colossalai/legacy/moe/openmoe/model/modeling_openmoe.py
similarity index 99%
rename from examples/language/openmoe/model/modeling_openmoe.py
rename to colossalai/legacy/moe/openmoe/model/modeling_openmoe.py
index 1febacd7d226..5d6e91765883 100644
--- a/examples/language/openmoe/model/modeling_openmoe.py
+++ b/colossalai/legacy/moe/openmoe/model/modeling_openmoe.py
@@ -50,8 +50,8 @@
 except:
     HAS_FLASH_ATTN = False
 from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import get_activation, set_moe_args
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import get_activation, set_moe_args
 from colossalai.shardformer.layer.moe import SparseMLP
 
 if HAS_TRITON:
diff --git a/examples/language/openmoe/model/openmoe_8b_config.json b/colossalai/legacy/moe/openmoe/model/openmoe_8b_config.json
similarity index 100%
rename from examples/language/openmoe/model/openmoe_8b_config.json
rename to colossalai/legacy/moe/openmoe/model/openmoe_8b_config.json
diff --git a/examples/language/openmoe/model/openmoe_base_config.json b/colossalai/legacy/moe/openmoe/model/openmoe_base_config.json
similarity index 100%
rename from examples/language/openmoe/model/openmoe_base_config.json
rename to colossalai/legacy/moe/openmoe/model/openmoe_base_config.json
diff --git a/examples/language/openmoe/model/openmoe_policy.py b/colossalai/legacy/moe/openmoe/model/openmoe_policy.py
similarity index 99%
rename from examples/language/openmoe/model/openmoe_policy.py
rename to colossalai/legacy/moe/openmoe/model/openmoe_policy.py
index f46062128563..ccd566b08594 100644
--- a/examples/language/openmoe/model/openmoe_policy.py
+++ b/colossalai/legacy/moe/openmoe/model/openmoe_policy.py
@@ -9,7 +9,7 @@
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.utils import logging
 
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
diff --git a/examples/language/openmoe/requirements.txt b/colossalai/legacy/moe/openmoe/requirements.txt
similarity index 100%
rename from examples/language/openmoe/requirements.txt
rename to colossalai/legacy/moe/openmoe/requirements.txt
diff --git a/examples/language/openmoe/test_ci.sh b/colossalai/legacy/moe/openmoe/test_ci.sh
similarity index 100%
rename from examples/language/openmoe/test_ci.sh
rename to colossalai/legacy/moe/openmoe/test_ci.sh
diff --git a/examples/language/openmoe/train.py b/colossalai/legacy/moe/openmoe/train.py
similarity index 99%
rename from examples/language/openmoe/train.py
rename to colossalai/legacy/moe/openmoe/train.py
index ff0e4bad6ee3..0173f0964453 100644
--- a/examples/language/openmoe/train.py
+++ b/colossalai/legacy/moe/openmoe/train.py
@@ -19,7 +19,7 @@
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
 from colossalai.cluster import DistCoordinator
-from colossalai.moe.utils import skip_init
+from colossalai.legacy.moe.utils import skip_init
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.shardformer.layer.moe import apply_load_balance
 
diff --git a/examples/language/openmoe/train.sh b/colossalai/legacy/moe/openmoe/train.sh
similarity index 100%
rename from examples/language/openmoe/train.sh
rename to colossalai/legacy/moe/openmoe/train.sh
diff --git a/colossalai/moe/utils.py b/colossalai/legacy/moe/utils.py
similarity index 99%
rename from colossalai/moe/utils.py
rename to colossalai/legacy/moe/utils.py
index 3d08ab7dd9b0..d91c41363316 100644
--- a/colossalai/moe/utils.py
+++ b/colossalai/legacy/moe/utils.py
@@ -9,7 +9,7 @@
 from torch.distributed.distributed_c10d import get_process_group_ranks
 
 from colossalai.accelerator import get_accelerator
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
 from colossalai.tensor.moe_tensor.api import is_moe_tensor
 
 
diff --git a/colossalai/legacy/nn/layer/parallel_1d/_operation.py b/colossalai/legacy/nn/layer/parallel_1d/_operation.py
index f01da97ba39a..8b8f04ccf456 100644
--- a/colossalai/legacy/nn/layer/parallel_1d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_1d/_operation.py
@@ -81,7 +81,6 @@ def backward(ctx, grad_output):
             handle = dist.all_reduce(grad_input, group=gpc.get_group(ctx.parallel_mode), async_op=True)
             # Delay the start of weight gradient computation shortly (3us) to have
             # all-reduce scheduled first and have GPU resources allocated
-            _ = torch.empty(1, device=grad_output.device) + 1
 
         grad_weight = grad_output.t().matmul(total_input)
         grad_bias = grad_output.sum(dim=0) if use_bias else None
diff --git a/colossalai/moe/__init__.py b/colossalai/moe/__init__.py
index 0623d19efd5f..e69de29bb2d1 100644
--- a/colossalai/moe/__init__.py
+++ b/colossalai/moe/__init__.py
@@ -1,5 +0,0 @@
-from .manager import MOE_MANAGER
-
-__all__ = [
-    "MOE_MANAGER",
-]
diff --git a/colossalai/moe/_operation.py b/colossalai/moe/_operation.py
index 01c837ee36ad..ac422a4da98f 100644
--- a/colossalai/moe/_operation.py
+++ b/colossalai/moe/_operation.py
@@ -290,7 +290,7 @@ def moe_cumsum(inputs: Tensor, use_kernel: bool = False):
         return torch.cumsum(inputs, dim=0) - 1
 
 
-class MoeInGradScaler(torch.autograd.Function):
+class EPGradScalerIn(torch.autograd.Function):
     """
     Scale the gradient back by the number of experts
     because the batch size increases in the moe stage
@@ -298,8 +298,7 @@ class MoeInGradScaler(torch.autograd.Function):
 
     @staticmethod
     def forward(ctx: Any, inputs: Tensor, ep_size: int) -> Tensor:
-        if ctx is not None:
-            ctx.ep_size = ep_size
+        ctx.ep_size = ep_size
         return inputs
 
     @staticmethod
@@ -311,7 +310,7 @@ def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, None]:
         return grad, None
 
 
-class MoeOutGradScaler(torch.autograd.Function):
+class EPGradScalerOut(torch.autograd.Function):
     """
     Scale the gradient by the number of experts
     because the batch size increases in the moe stage
@@ -331,6 +330,50 @@ def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, None]:
         return grad, None
 
 
+class DPGradScalerIn(torch.autograd.Function):
+    """
+    Scale the gradient back by the number of experts
+    because the batch size increases in the moe stage
+    """
+
+    @staticmethod
+    def forward(ctx: Any, inputs: Tensor, moe_dp_size: int, activated_experts: int) -> Tensor:
+        assert activated_experts != 0, f"shouldn't be called when no expert is activated"
+        ctx.moe_dp_size = moe_dp_size
+        ctx.activated_experts = activated_experts
+        return inputs
+
+    @staticmethod
+    def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, None, None]:
+        assert len(grad_outputs) == 1
+        grad = grad_outputs[0]
+        if ctx.moe_dp_size != ctx.activated_experts:
+            grad.mul_(ctx.activated_experts / ctx.moe_dp_size)
+        return grad, None, None
+
+
+class DPGradScalerOut(torch.autograd.Function):
+    """
+    Scale the gradient by the number of experts
+    because the batch size increases in the moe stage
+    """
+
+    @staticmethod
+    def forward(ctx: Any, inputs: Tensor, moe_dp_size: int, activated_experts: int) -> Tensor:
+        assert activated_experts != 0, f"shouldn't be called when no expert is activated"
+        ctx.moe_dp_size = moe_dp_size
+        ctx.activated_experts = activated_experts
+        return inputs
+
+    @staticmethod
+    def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, None, None]:
+        assert len(grad_outputs) == 1
+        grad = grad_outputs[0]
+        if ctx.moe_dp_size != ctx.activated_experts:
+            grad.mul_(ctx.moe_dp_size / ctx.activated_experts)
+        return grad, None, None
+
+
 def _all_to_all(
     inputs: torch.Tensor,
     input_split_sizes: Optional[List[int]] = None,
@@ -393,4 +436,7 @@ def all_to_all_uneven(
     group=None,
     overlap: bool = False,
 ):
+    assert (
+        inputs.requires_grad
+    ), "Input must require grad to assure that backward is executed, otherwise it might hang the program."
     return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap)
diff --git a/colossalai/pipeline/p2p.py b/colossalai/pipeline/p2p.py
index ed190eb0885f..b7b2842136c5 100644
--- a/colossalai/pipeline/p2p.py
+++ b/colossalai/pipeline/p2p.py
@@ -91,7 +91,11 @@ def _broadcast_object_list(
     my_rank = dist.get_rank()
     # Serialize object_list elements to tensors on src rank.
     if my_rank == src:
-        if Version(torch.__version__) >= Version("1.13.0"):
+        if Version(torch.__version__) >= Version("2.3.0"):
+            tensor_list, size_list = zip(
+                *[c10d._object_to_tensor(obj, device=current_device, group=group) for obj in object_list]
+            )
+        elif Version(torch.__version__) >= Version("1.13.0"):
             tensor_list, size_list = zip(*[c10d._object_to_tensor(obj, device=current_device) for obj in object_list])
         else:
             tensor_list, size_list = zip(*[c10d._object_to_tensor(obj) for obj in object_list])
@@ -276,7 +280,11 @@ def _send_recv_serialization_object(
     send_object_tensor = None
     send_object_size_tensor = None
     if object is not None and send_dst is not None:
-        if Version(torch.__version__) >= Version("1.13.0"):
+        if Version(torch.__version__) >= Version("2.3.0"):
+            send_object_tensor, send_object_size_tensor = c10d._object_to_tensor(
+                object, device=current_device, group=send_group
+            )
+        elif Version(torch.__version__) >= Version("1.13.0"):
             send_object_tensor, send_object_size_tensor = c10d._object_to_tensor(object, device=current_device)
         else:
             send_object_tensor, send_object_size_tensor = c10d._object_to_tensor(object)
diff --git a/colossalai/shardformer/layer/__init__.py b/colossalai/shardformer/layer/__init__.py
index f17fad1b6606..331e4972966c 100644
--- a/colossalai/shardformer/layer/__init__.py
+++ b/colossalai/shardformer/layer/__init__.py
@@ -3,7 +3,7 @@
 from .dropout import DropoutForParallelInput, DropoutForReplicatedInput
 from .embedding import Embedding1D, PaddingEmbedding, VocabParallelEmbedding1D
 from .linear import Linear1D_Col, Linear1D_Row, PaddingLMHead, VocabParallelLMHead1D
-from .loss import cross_entropy_1d
+from .loss import cross_entropy_1d, dist_cross_entropy
 from .normalization import FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm
 from .parallel_module import ParallelModule
 from .qkv_fused_linear import FusedLinear1D_Col, GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
@@ -18,6 +18,7 @@
     "DropoutForParallelInput",
     "DropoutForReplicatedInput",
     "cross_entropy_1d",
+    "dist_cross_entropy",
     "BaseLayerNorm",
     "LayerNorm",
     "RMSNorm",
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index a27fd35c192d..feebd2d0529d 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -146,7 +146,7 @@ def backward(ctx, grad_output):
         if use_bias:
             bias.view(bias.shape)
 
-        total_input = input
+        total_input = input.contiguous()
         grad_input = grad_output.matmul(weight)
         grad_output = grad_output.contiguous()
         # Convert the tensor shapes to 2D for execution compatibility
diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 141baf3d3770..5872c64856b9 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -139,12 +139,11 @@ def prepare_attn_kwargs(
             # no padding
             assert is_causal
             outputs["attention_mask_type"] = AttnMaskType.CAUSAL
-            attention_mask = torch.ones(s_q, s_kv, dtype=dtype, device=device).tril(diagonal=0).expand(b, s_q, s_kv)
+            attention_mask = torch.ones(s_q, s_kv, dtype=dtype, device=device)
+            if s_q != 1:
+                attention_mask = attention_mask.tril(diagonal=0)
+            attention_mask = attention_mask.expand(b, s_q, s_kv)
         else:
-            assert q_padding_mask.shape == (
-                b,
-                s_q,
-            ), f"q_padding_mask shape {q_padding_mask.shape} should be the same. ({shape_4d})"
             max_seqlen_q, cu_seqlens_q, q_indices = get_pad_info(q_padding_mask)
             if kv_padding_mask is None:
                 # self attention
@@ -156,7 +155,7 @@ def prepare_attn_kwargs(
                 b,
                 s_kv,
             ), f"q_padding_mask shape {kv_padding_mask.shape} should be the same. ({shape_4d})"
-            attention_mask = q_padding_mask[:, None, :].expand(b, s_kv, s_q).to(dtype=dtype, device=device)
+            attention_mask = kv_padding_mask[:, None, :].expand(b, s_q, s_kv).to(dtype=dtype, device=device)
             outputs.update(
                 {
                     "cu_seqlens_q": cu_seqlens_q,
@@ -169,7 +168,8 @@ def prepare_attn_kwargs(
             )
             if is_causal:
                 outputs["attention_mask_type"] = AttnMaskType.PADDED_CAUSAL
-                attention_mask = attention_mask * attention_mask.new_ones(s_q, s_kv).tril(diagonal=0)
+                if s_q != 1:
+                    attention_mask = attention_mask * attention_mask.new_ones(s_q, s_kv).tril(diagonal=0)
             else:
                 outputs["attention_mask_type"] = AttnMaskType.PADDED
         attention_mask = invert_mask(attention_mask).unsqueeze(1)
diff --git a/colossalai/shardformer/layer/loss.py b/colossalai/shardformer/layer/loss.py
index a6d19edf5b53..cea2da03fb58 100644
--- a/colossalai/shardformer/layer/loss.py
+++ b/colossalai/shardformer/layer/loss.py
@@ -2,8 +2,11 @@
 import torch.distributed as dist
 from torch.autograd import Function
 from torch.distributed import ProcessGroup
+from torch.nn import CrossEntropyLoss
 
-__all__ = ["DistCrossEntropy", "cross_entropy_1d"]
+from colossalai.shardformer.shard import ShardConfig
+
+__all__ = ["DistCrossEntropy", "cross_entropy_1d", "dist_cross_entropy"]
 
 
 class DistCrossEntropy(Function):
@@ -132,3 +135,43 @@ def cross_entropy_1d(
     dtype: torch.dtype = None,
 ) -> torch.Tensor:
     return DistCrossEntropy.apply(vocab_logits, labels, ignore_index, process_group, vocab_size, dtype)
+
+
+def dist_cross_entropy(
+    labels: torch.Tensor,
+    logits: torch.Tensor,
+    shard_config: ShardConfig,
+    out_features: int,
+    vocab_size: int,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """
+    Helper to compute cross entropy loss for most shardformer models,
+    compatible with PP, TP and SP.
+    """
+    if labels is not None:
+        # Shift so that tokens < n predict n
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
+        loss_fct = CrossEntropyLoss()
+        shift_labels = shift_labels.view(-1)
+        shift_labels = shift_labels.to(shift_logits.device)
+        if shard_config.enable_tensor_parallelism and shard_config.parallel_output:
+            # Cross entropy with all-reduce for TP
+            new_vocab_size = logits.shape[-1]
+            shift_logits = shift_logits.view(-1, new_vocab_size)
+            loss = cross_entropy_1d(
+                shift_logits,
+                shift_labels,
+                process_group=shard_config.tensor_parallel_process_group,
+                vocab_size=out_features,
+                dtype=dtype,
+            )
+        else:
+            # NOTE if use TP and not parallel_output, the output is gathered.
+            # see VocabParallelLMHead1D
+            shift_logits = shift_logits.view(-1, vocab_size)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        return loss
diff --git a/colossalai/shardformer/layer/qkv_fused_linear.py b/colossalai/shardformer/layer/qkv_fused_linear.py
index d8425b58db4f..93a7eb231b0e 100644
--- a/colossalai/shardformer/layer/qkv_fused_linear.py
+++ b/colossalai/shardformer/layer/qkv_fused_linear.py
@@ -722,6 +722,7 @@ def from_native_module(
             process_group=process_group,
             weight=module.weight,
             bias_=module.bias,
+            n_fused=n_fused,
             *args,
             **kwargs,
         )
diff --git a/colossalai/shardformer/modeling/bloom.py b/colossalai/shardformer/modeling/bloom.py
index 1541436264e9..26ffef6c5ee0 100644
--- a/colossalai/shardformer/modeling/bloom.py
+++ b/colossalai/shardformer/modeling/bloom.py
@@ -28,7 +28,7 @@
 from colossalai.shardformer.layer._operation import gather_forward_split_backward, split_forward_gather_backward
 from colossalai.shardformer.shard import ShardConfig
 
-from ..layer import cross_entropy_1d
+from ..layer import dist_cross_entropy
 
 logger = logging.get_logger(__name__)
 
@@ -359,30 +359,14 @@ def bloom_for_causal_lm_forward(
             hidden_states = transformer_outputs[0]
             lm_logits = self.lm_head(hidden_states).contiguous()
 
-            loss = None
-            if labels is not None:
-                # move labels to correct device to enable model parallelism
-                labels = labels.to(lm_logits.device)
-                # Shift so that tokens < n predict n
-                shift_logits = lm_logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-                batch_size, seq_length, vocab_size = shift_logits.shape
-                # Flatten the tokens
-                if shard_config.enable_tensor_parallelism and shard_config.parallel_output:
-                    new_vocab_size = lm_logits.shape[-1]
-                    shift_logits = shift_logits.view(-1, new_vocab_size)
-                    shift_labels = shift_labels.view(-1)
-                    loss = cross_entropy_1d(
-                        shift_logits,
-                        shift_labels,
-                        process_group=shard_config.tensor_parallel_process_group,
-                        vocab_size=self.lm_head.out_features,
-                        dtype=self.transformer.dtype,
-                    )
-                else:
-                    loss_fct = CrossEntropyLoss()
-                    shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                    loss = loss_fct(shift_logits, shift_labels.view(-1))
+            loss = dist_cross_entropy(
+                labels,
+                lm_logits,
+                shard_config,
+                self.lm_head.out_features,
+                self.config.vocab_size,
+                self.transformer.dtype,
+            )
 
             if not return_dict:
                 output = (lm_logits,) + transformer_outputs[1:]
@@ -1040,24 +1024,10 @@ def forward(
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)
 
-        loss = None
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(lm_logits.device)
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            new_vocab_size = lm_logits.shape[-1]
-            shift_logits = shift_logits.view(-1, new_vocab_size)
-            shift_labels = shift_labels.view(-1)
-            loss = cross_entropy_1d(
-                shift_logits,
-                shift_labels,
-                process_group=shard_config.tensor_parallel_process_group,
-                vocab_size=self.lm_head.out_features,
-                dtype=self.transformer.dtype,
-            )
+        loss = dist_cross_entropy(
+            labels, lm_logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.transformer.dtype
+        )
+
         if not return_dict:
             output = (lm_logits,) + transformer_outputs[1:]
             return ((loss,) + output) if loss is not None else output
diff --git a/colossalai/shardformer/modeling/chatglm2.py b/colossalai/shardformer/modeling/chatglm2.py
index 53c151f02f63..34d900d8de94 100644
--- a/colossalai/shardformer/modeling/chatglm2.py
+++ b/colossalai/shardformer/modeling/chatglm2.py
@@ -11,7 +11,11 @@
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer import ShardConfig
 from colossalai.shardformer.layer import AttnMaskType, ColoAttention
-from colossalai.shardformer.layer._operation import gather_forward_split_backward, split_forward_gather_backward
+from colossalai.shardformer.layer._operation import (
+    all_to_all_comm,
+    gather_forward_split_backward,
+    split_forward_gather_backward,
+)
 
 
 def get_flash_core_attention_forward():
@@ -203,6 +207,13 @@ def chatglm_model_forward(
                     dim=0,
                     process_group=shard_config.tensor_parallel_process_group,
                 )
+            elif shard_config.sequence_parallelism_mode == "all_to_all":
+                hidden_states = split_forward_gather_backward(
+                    hidden_states,
+                    dim=0,
+                    process_group=shard_config.sequence_parallel_process_group,
+                    grad_scale=1 / shard_config.sequence_parallel_size,
+                )
         for idx in range(start_idx, end_idx):
             layer = self.encoder._get_layer(idx)
             if output_hidden_states:
@@ -235,6 +246,13 @@ def chatglm_model_forward(
                     dim=0,
                     process_group=shard_config.tensor_parallel_process_group,
                 )
+            elif shard_config.sequence_parallelism_mode == "all_to_all":
+                hidden_states = gather_forward_split_backward(
+                    hidden_states,
+                    dim=0,
+                    process_group=shard_config.sequence_parallel_process_group,
+                    grad_scale=shard_config.sequence_parallel_size,
+                )
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
         if stage_manager.is_last_stage():
@@ -329,7 +347,9 @@ def chatglm_for_conditional_generation_forward(
             return transformer_outputs
 
 
-def get_chatglm_sequence_parallel_forward_fn(shard_config: ShardConfig):
+def get_chatglm_sequence_parallel_forward_fn(shard_config: ShardConfig, sp_mode, sp_size, sp_group):
+    logger = logging.get_logger(__name__)
+
     def forward(
         self,
         input_ids,
@@ -381,13 +401,27 @@ def forward(
             rotary_pos_emb = rotary_pos_emb[None, :seq_length]
         rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
 
+        if sp_mode in ["all_to_all"] and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with sp mode `{sp_mode}`. Setting `use_cache=False`..."
+                )
+                use_cache = False
         # Run encoder.
         # [seq_len, batch_size, hidden_size] -> [seq_len/TP_size, batch_size, hidden_size]
-        inputs_embeds = split_forward_gather_backward(
-            inputs_embeds,
-            dim=0,
-            process_group=shard_config.tensor_parallel_process_group,
-        )
+        if sp_mode in ["split_gather"]:
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds,
+                dim=0,
+                process_group=sp_group,
+            )
+        elif sp_mode == "all_to_all":
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds,
+                dim=0,
+                process_group=sp_group,
+                grad_scale=1 / sp_size,
+            )
         hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
             inputs_embeds,
             full_attention_mask,
@@ -397,11 +431,19 @@ def forward(
             output_hidden_states=output_hidden_states,
         )
 
-        hidden_states = gather_forward_split_backward(
-            hidden_states,
-            dim=0,
-            process_group=shard_config.tensor_parallel_process_group,
-        )
+        if sp_mode in ["split_gather"]:
+            hidden_states = gather_forward_split_backward(
+                hidden_states,
+                dim=0,
+                process_group=shard_config.tensor_parallel_process_group,
+            )
+        elif sp_mode == "all_to_all":
+            hidden_states = gather_forward_split_backward(
+                hidden_states,
+                dim=0,
+                process_group=sp_group,
+                grad_scale=sp_size,
+            )
 
         if not return_dict:
             return tuple(
@@ -423,3 +465,158 @@ def forward(
         )
 
     return forward
+
+
+def get_chatglm_sequence_parallel_attention_forward(shard_config: ShardConfig, sp_mode, sp_size, sp_group):
+    from .chatglm2_6b.modeling_chatglm import apply_rotary_pos_emb, split_tensor_along_last_dim
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        rotary_pos_emb,
+        kv_cache=None,
+        use_cache=True,
+    ):
+        if sp_mode is not None:
+            assert sp_mode in ["all_to_all", "split_gather"], "Invalid sp_mode"
+            assert (sp_size is not None) and (
+                sp_group is not None
+            ), "Must specify sp_size and sp_group for sequence parallel"
+
+        mixed_x_layer = self.query_key_value(hidden_states)
+        if self.multi_query_attention:
+            (query_layer, key_layer, value_layer) = mixed_x_layer.split(
+                [
+                    self.num_attention_heads_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
+                    self.num_multi_query_groups_per_partition * self.hidden_size_per_attention_head,
+                ],
+                dim=-1,
+            )
+            query_layer = query_layer.view(
+                query_layer.size()[:-1]
+                + (
+                    self.num_attention_heads_per_partition,
+                    self.hidden_size_per_attention_head,
+                )
+            )
+            key_layer = key_layer.view(
+                key_layer.size()[:-1]
+                + (
+                    self.num_multi_query_groups_per_partition,
+                    self.hidden_size_per_attention_head,
+                )
+            )
+            value_layer = value_layer.view(
+                value_layer.size()[:-1]
+                + (
+                    self.num_multi_query_groups_per_partition,
+                    self.hidden_size_per_attention_head,
+                )
+            )
+        else:
+            new_tensor_shape = mixed_x_layer.size()[:-1] + (
+                self.num_attention_heads_per_partition,
+                3 * self.hidden_size_per_attention_head,
+            )
+            mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
+            # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn]
+            (query_layer, key_layer, value_layer) = split_tensor_along_last_dim(mixed_x_layer, 3)
+
+        # sp: all-to-all comminucation when introducing sequence parallel
+        if sp_mode == "all_to_all":
+            sq, bs, _, _ = value_layer.size()
+
+            query_layer = query_layer.reshape(sq, bs, -1)
+            key_layer = key_layer.reshape(sq, bs, -1)
+            value_layer = value_layer.reshape(sq, bs, -1)
+
+            query_layer = all_to_all_comm(query_layer, sp_group, gather_dim=0)
+            key_layer = all_to_all_comm(key_layer, sp_group, gather_dim=0)
+            value_layer = all_to_all_comm(value_layer, sp_group, gather_dim=0)
+
+            query_layer = query_layer.view(
+                sq * sp_size,
+                bs,
+                self.num_attention_heads_per_partition // sp_size,
+                self.hidden_size_per_attention_head,
+            ).contiguous()
+
+            key_layer = key_layer.view(
+                sq * sp_size,
+                bs,
+                self.num_attention_heads_per_partition // sp_size,
+                self.hidden_size_per_attention_head,
+            ).contiguous()
+
+            value_layer = value_layer.view(
+                sq * sp_size,
+                bs,
+                self.num_attention_heads_per_partition // sp_size,
+                self.hidden_size_per_attention_head,
+            ).contiguous()
+
+        # apply relative positional encoding (rotary embedding)
+        if rotary_pos_emb is not None:
+            query_layer = apply_rotary_pos_emb(query_layer, rotary_pos_emb)
+            key_layer = apply_rotary_pos_emb(key_layer, rotary_pos_emb)
+
+        # adjust key and value for inference
+        if kv_cache is not None:
+            cache_k, cache_v = kv_cache
+            key_layer = torch.cat((cache_k, key_layer), dim=0)
+            value_layer = torch.cat((cache_v, value_layer), dim=0)
+        if use_cache:
+            kv_cache = (key_layer, value_layer)
+        else:
+            kv_cache = None
+
+        if self.multi_query_attention:
+            key_layer = key_layer.unsqueeze(-2)
+            key_layer = key_layer.expand(
+                -1,
+                -1,
+                -1,
+                self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition,
+                -1,
+            )
+            key_layer = key_layer.contiguous().view(
+                key_layer.size()[:2]
+                + (
+                    self.num_attention_heads_per_partition,
+                    self.hidden_size_per_attention_head,
+                )
+            )
+            value_layer = value_layer.unsqueeze(-2)
+            value_layer = value_layer.expand(
+                -1,
+                -1,
+                -1,
+                self.num_attention_heads_per_partition // self.num_multi_query_groups_per_partition,
+                -1,
+            )
+            value_layer = value_layer.contiguous().view(
+                value_layer.size()[:2]
+                + (
+                    self.num_attention_heads_per_partition // sp_size,
+                    self.hidden_size_per_attention_head,
+                )
+            )
+
+        # ==================================
+        # core attention computation
+        # ==================================
+
+        context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
+        if sp_mode == "all_to_all":
+            context_layer = all_to_all_comm(context_layer, sp_group, gather_dim=2, scatter_dim=0)
+
+        # =================
+        # Output. [sq, b, h]
+        # =================
+        output = self.dense(context_layer)
+
+        return output, kv_cache
+
+    return forward
diff --git a/colossalai/shardformer/modeling/command.py b/colossalai/shardformer/modeling/command.py
index 07a7f6cbf8d3..5b36fc7db3b9 100644
--- a/colossalai/shardformer/modeling/command.py
+++ b/colossalai/shardformer/modeling/command.py
@@ -5,7 +5,6 @@
 import torch
 import torch.utils.checkpoint
 from torch import nn
-from torch.nn import CrossEntropyLoss
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.models.cohere.modeling_cohere import (
@@ -25,7 +24,7 @@
 )
 from colossalai.shardformer.shard import ShardConfig
 
-from ..layer import ColoAttention, cross_entropy_1d
+from ..layer import ColoAttention, dist_cross_entropy
 
 
 class CommandPipelineForwards:
@@ -117,7 +116,7 @@ def command_model_forward(
         # for the other stages, hidden_states is the output of the previous stage
         if shard_config.enable_flash_attention:
             # in this case, attention_mask is a dict rather than a tensor
-            mask_shape = (batch_size, 1, seq_length_with_past, seq_length_with_past)
+            mask_shape = (batch_size, 1, seq_length, seq_length_with_past)
             attention_mask = ColoAttention.prepare_attn_kwargs(
                 mask_shape,
                 hidden_states.dtype,
@@ -135,6 +134,21 @@ def command_model_forward(
                 )
                 use_cache = False
 
+        if shard_config and shard_config.enable_sequence_parallelism:
+            if shard_config.sequence_parallelism_mode in ["split_gather", "ring"]:
+                hidden_states = split_forward_gather_backward(
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.tensor_parallel_process_group,
+                )
+            elif shard_config.sequence_parallelism_mode == "all_to_all":
+                hidden_states = split_forward_gather_backward(
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.sequence_parallel_process_group,
+                    grad_scale=1 / shard_config.sequence_parallel_size,
+                )
+
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
@@ -191,6 +205,21 @@ def command_model_forward(
         if stage_manager.is_last_stage():
             hidden_states = self.norm(hidden_states)
 
+        if shard_config and shard_config.enable_sequence_parallelism:
+            if shard_config.sequence_parallelism_mode in ["split_gather", "ring"]:
+                hidden_states = gather_forward_split_backward(
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.tensor_parallel_process_group,
+                )
+            elif shard_config.sequence_parallelism_mode == "all_to_all":
+                hidden_states = gather_forward_split_backward(
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.sequence_parallel_process_group,
+                    grad_scale=shard_config.sequence_parallel_size,
+                )
+
         # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
@@ -300,29 +329,9 @@ def command_for_causal_lm_forward(
             logits = self.lm_head(hidden_states)
             logits = logits * self.logit_scale
             logits = logits.float()
-            loss = None
-            if labels is not None:
-                # Shift so that tokens < n predict n
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-                # Flatten the tokens
-                loss_fct = CrossEntropyLoss()
-                shift_labels = shift_labels.view(-1)
-                # Enable model parallelism
-                shift_labels = shift_labels.to(shift_logits.device)
-                if shard_config.enable_tensor_parallelism and shard_config.parallel_output:
-                    new_vocab_size = logits.shape[-1]
-                    shift_logits = shift_logits.view(-1, new_vocab_size)
-                    loss = cross_entropy_1d(
-                        shift_logits,
-                        shift_labels,
-                        process_group=shard_config.tensor_parallel_process_group,
-                        vocab_size=self.lm_head.out_features,
-                        dtype=self.model.dtype,
-                    )
-                else:
-                    shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                    loss = loss_fct(shift_logits, shift_labels)
+            loss = dist_cross_entropy(
+                labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
+            )
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
@@ -658,24 +667,14 @@ def forward(
         logits = self.lm_head(hidden_states)
         logits = logits * self.logit_scale
         logits = logits.float()
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            new_vocab_size = logits.shape[-1]
-            shift_logits = shift_logits.view(-1, new_vocab_size)
-            loss = cross_entropy_1d(
-                shift_logits,
-                shift_labels,
-                process_group=shard_config.tensor_parallel_process_group,
-                vocab_size=self.lm_head.out_features,
-                dtype=self.model.dtype,
-            )
+        loss = dist_cross_entropy(
+            labels,
+            logits,
+            shard_config,
+            self.lm_head.out_features,
+            self.config.vocab_size,
+            self.model.dtype,
+        )
 
         if not return_dict:
             output = (logits,) + outputs[1:]
diff --git a/colossalai/shardformer/modeling/deepseek.py b/colossalai/shardformer/modeling/deepseek.py
new file mode 100644
index 000000000000..a84a3097231a
--- /dev/null
+++ b/colossalai/shardformer/modeling/deepseek.py
@@ -0,0 +1,754 @@
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed import ProcessGroup
+from torch.nn import CrossEntropyLoss
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+from transformers.utils import is_flash_attn_2_available, logging
+
+from colossalai.lazy import LazyInitContext
+from colossalai.moe._operation import (
+    DPGradScalerIn,
+    DPGradScalerOut,
+    EPGradScalerIn,
+    EPGradScalerOut,
+    all_to_all_uneven,
+)
+from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.layer._operation import (
+    all_to_all_comm,
+    gather_forward_split_backward,
+    split_forward_gather_backward,
+)
+from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row
+from colossalai.shardformer.shard import ShardConfig
+from colossalai.shardformer.shard.utils import set_tensors_to_none
+from colossalai.tensor.moe_tensor.api import set_moe_tensor_ep_group
+
+
+# copied from modeling_deepseek.py
+class AddAuxiliaryLoss(torch.autograd.Function):
+    """
+    The trick function of adding auxiliary (aux) loss,
+    which includes the gradient of the aux loss during backpropagation.
+    """
+
+    @staticmethod
+    def forward(ctx, x, loss):
+        assert loss.numel() == 1
+        ctx.dtype = loss.dtype
+        ctx.required_aux_loss = loss.requires_grad
+        return x
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_loss = None
+        if ctx.required_aux_loss:
+            grad_loss = torch.ones(1, dtype=ctx.dtype, device=grad_output.device)
+        return grad_output, grad_loss
+
+
+class EPDeepseekMoE(nn.Module):
+    def __init__(self):
+        raise RuntimeError(f"Please use `from_native_module` to create an instance of {self.__class__.__name__}")
+
+    def setup_process_groups(self, tp_group: ProcessGroup, moe_dp_group: ProcessGroup, ep_group: ProcessGroup):
+        assert tp_group is not None
+        assert moe_dp_group is not None
+        assert ep_group is not None
+
+        self.ep_size = dist.get_world_size(ep_group)
+        self.ep_rank = dist.get_rank(ep_group)
+        self.num_experts = self.config.n_routed_experts
+        assert self.num_experts % self.ep_size == 0
+
+        self.ep_group = ep_group
+        self.num_experts_per_ep = self.num_experts // self.ep_size
+        self.expert_start_idx = self.ep_rank * self.num_experts_per_ep
+        held_experts = self.experts[self.expert_start_idx : self.expert_start_idx + self.num_experts_per_ep]
+
+        set_tensors_to_none(self.experts, exclude=set(held_experts))
+
+        # setup moe_dp group
+        self.moe_dp_group = moe_dp_group
+        self.moe_dp_size = moe_dp_group.size()
+
+        # setup tp group
+        self.tp_group = tp_group
+        if self.tp_group.size() > 1:
+            for expert in held_experts:
+                expert.gate_proj = Linear1D_Col.from_native_module(expert.gate_proj, self.tp_group)
+                expert.up_proj = Linear1D_Col.from_native_module(expert.up_proj, self.tp_group)
+                expert.down_proj = Linear1D_Row.from_native_module(expert.down_proj, self.tp_group)
+
+        for p in self.experts.parameters():
+            set_moe_tensor_ep_group(p, ep_group)
+
+    @staticmethod
+    def from_native_module(
+        module,
+        tp_group: ProcessGroup,
+        moe_dp_group: ProcessGroup,
+        ep_group: ProcessGroup,
+        *args,
+        **kwargs,
+    ) -> "EPDeepseekMoE":
+        LazyInitContext.materialize(module)
+        if module.__class__.__name__ == "DeepseekMLP":
+            return module
+        module.__class__ = EPDeepseekMoE
+        module.setup_process_groups(tp_group, moe_dp_group, ep_group)
+        return module
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        identity = hidden_states
+        orig_shape = hidden_states.shape
+
+        topk_experts_idx, topk_experts_weight, aux_loss = self.gate(hidden_states)
+
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # [t0, t1, t2 ...]
+        hidden_states = hidden_states.repeat_interleave(
+            self.num_experts_per_tok, dim=0
+        )  # after repeat_interleave: [t0 t0 t1 t1 t2 t2 ... ]
+
+        flat_topk_experts_idx = topk_experts_idx.view(-1)  # [e0 e1 e2 ...]
+        # The elements of flat_topk_token_idx are token ids, which are arranged in ascending order of expert ids.
+        flat_topk_token_idx = flat_topk_experts_idx.argsort()
+
+        # Now we adjust the order of the hidden states, also in ascending order of expert id
+        dispatch_states = hidden_states[flat_topk_token_idx]
+        input_split_sizes = flat_topk_experts_idx.bincount(minlength=self.num_experts)  # [n0, n1, n2, n3]
+        output_split_sizes = torch.zeros_like(input_split_sizes)
+
+        # [n0, n1, n2, n3] [m0, m1, m2, m3] -> [n0, n1, m0, m1] [n2, n3, m2, m3]
+        dist.all_to_all_single(output_split_sizes, input_split_sizes, group=self.ep_group)
+
+        with torch.no_grad():
+            activate_experts = output_split_sizes[: self.num_experts_per_ep].clone()
+            for i in range(1, self.ep_size):
+                activate_experts += output_split_sizes[i * self.num_experts_per_ep : (i + 1) * self.num_experts_per_ep]
+            activate_experts = (activate_experts > 0).float()
+        dist.all_reduce(activate_experts, group=self.moe_dp_group)
+
+        input_split_list = input_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()
+        output_split_list = output_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()
+        output_states, _ = all_to_all_uneven(dispatch_states, input_split_list, output_split_list, self.ep_group)
+        output_states = EPGradScalerIn.apply(output_states, self.ep_size)
+
+        if output_states.size(0) > 0:
+            if self.num_experts_per_ep == 1:
+                expert = self.experts[self.expert_start_idx]
+                output_states = DPGradScalerIn.apply(output_states, self.moe_dp_size, activate_experts[0])
+                output_states = expert(output_states)
+                output_states = DPGradScalerOut.apply(output_states, self.moe_dp_size, activate_experts[0])
+            else:
+                output_states_splits = output_states.split(output_split_sizes.tolist())
+                output_states_list = []
+                for i, split_states in enumerate(output_states_splits):
+                    if split_states.size(0) == 0:  # no token routed to this experts
+                        continue
+                    expert = self.experts[self.expert_start_idx + i % self.num_experts_per_ep]
+                    split_states = DPGradScalerIn.apply(
+                        split_states, self.moe_dp_size, activate_experts[i % self.num_experts_per_ep]
+                    )
+                    split_states = expert(split_states)
+                    split_states = DPGradScalerOut.apply(
+                        split_states, self.moe_dp_size, activate_experts[i % self.num_experts_per_ep]
+                    )
+                    output_states_list.append(split_states)
+                output_states = torch.cat(output_states_list)
+        output_states = EPGradScalerOut.apply(output_states, self.ep_size)
+        dispatch_states, _ = all_to_all_uneven(output_states, output_split_list, input_split_list, self.ep_group)
+        recover_token_idx = torch.empty_like(flat_topk_token_idx)
+        recover_token_idx[flat_topk_token_idx] = torch.arange(
+            flat_topk_token_idx.size(0), device=flat_topk_token_idx.device
+        )
+
+        output_hidden_states = dispatch_states[recover_token_idx]  # t0 t0 t1 t1 t2 t2
+        output_hidden_states = output_hidden_states.view(-1, self.num_experts_per_tok, orig_shape[-1])
+        output_hidden_states = (output_hidden_states * topk_experts_weight[:, :, None]).sum(dim=-2)  # (B*S, h)
+        output_hidden_states = output_hidden_states.view(*orig_shape)
+        output_hidden_states = AddAuxiliaryLoss.apply(output_hidden_states, aux_loss)
+        if self.config.n_shared_experts is not None:
+            output_hidden_states = output_hidden_states + self.shared_experts(identity)
+        return output_hidden_states
+
+
+class DeepseekPipelineForwards:
+    """
+    This class serves as a micro library for forward function substitution of Llama models
+    under pipeline setting.
+    """
+
+    @staticmethod
+    def deepseek_model_forward(
+        self: "DeepseekModel",
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        stage_manager: Optional[PipelineStageManager] = None,
+        hidden_states: Optional[torch.FloatTensor] = None,
+        stage_index: Optional[List[int]] = None,
+        shard_config: ShardConfig = None,
+    ):
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+        >>> model = AutoModelForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        logger = logging.get_logger(__name__)
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if stage_manager.is_first_stage():
+            # retrieve input_ids and inputs_embeds
+            if input_ids is not None and inputs_embeds is not None:
+                raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+            elif input_ids is not None:
+                batch_size, seq_length = input_ids.shape
+            elif inputs_embeds is not None:
+                batch_size, seq_length, _ = inputs_embeds.shape
+            else:
+                raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            if inputs_embeds is None:
+                inputs_embeds = self.embed_tokens(input_ids)
+            hidden_states = inputs_embeds
+        else:
+            input_shape = hidden_states.shape[:-1]
+            batch_size, seq_length = input_shape
+            device = hidden_states.device
+
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+
+        # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
+        if output_attentions:
+            logger.warning_once("output_attentions=True is not supported for pipeline models at the moment.")
+            output_attentions = False
+        if output_hidden_states:
+            logger.warning_once("output_hidden_states=True is not supported for pipeline models at the moment.")
+            output_hidden_states = False
+        if use_cache:
+            logger.warning_once("use_cache=True is not supported for pipeline models at the moment.")
+            use_cache = False
+
+        if past_key_values is not None:
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+
+        if position_ids is None:
+            position_ids = torch.arange(
+                past_key_values_length,
+                seq_length + past_key_values_length,
+                dtype=torch.long,
+                device=device,
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        # embed positions, for the first stage, hidden_states is the input embeddings,
+        # for the other stages, hidden_states is the output of the previous stage
+        if is_flash_attn_2_available():
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                hidden_states,
+                past_key_values_length,
+                sliding_window=self.config.sliding_window,
+            )
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+
+        start_idx, end_idx = stage_index[0], stage_index[1]
+        for idx, decoder_layer in enumerate(self.layers[start_idx:end_idx], start=start_idx):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    None,
+                    output_attentions,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_value,
+                    output_attentions,
+                    use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = (layer_outputs[2 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        if stage_manager.is_last_stage():
+            hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+
+        if stage_manager.is_last_stage():
+            if not return_dict:
+                return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+            return BaseModelOutputWithPast(
+                last_hidden_state=hidden_states,
+                past_key_values=next_cache,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attns,
+            )
+        # always return dict for imediate stage
+        return {
+            "hidden_states": hidden_states,
+        }
+
+    @staticmethod
+    def deepseek_for_causal_lm_forward(
+        self: "DeepseekForCausalLM",
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        stage_manager: Optional[PipelineStageManager] = None,
+        hidden_states: Optional[torch.FloatTensor] = None,
+        stage_index: Optional[List[int]] = None,
+        shard_config: ShardConfig = None,
+    ):
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, MixtralForCausalLM
+
+        >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        logger = logging.get_logger(__name__)
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
+        if output_attentions:
+            logger.warning_once("output_attentions=True is not supported for pipeline models at the moment.")
+            output_attentions = False
+        if output_hidden_states:
+            logger.warning_once("output_hidden_states=True is not supported for pipeline models at the moment.")
+            output_hidden_states = False
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = DeepseekPipelineForwards.deepseek_model_forward(
+            self.model,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            stage_manager=stage_manager,
+            hidden_states=hidden_states,
+            stage_index=stage_index,
+        )
+        past_key_values = None
+
+        if stage_manager.is_last_stage():
+            hidden_states = outputs[0]
+            logits = self.lm_head(hidden_states)
+            logits = logits.float()
+
+            loss = None
+            if labels is not None:
+                # Shift so that tokens < n predict n
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+                # Flatten the tokens
+                loss_fct = CrossEntropyLoss()
+                shift_logits = shift_logits.view(-1, self.config.vocab_size)
+                shift_labels = shift_labels.view(-1)
+                # Enable model parallelism
+                shift_labels = shift_labels.to(shift_logits.device)
+                loss = loss_fct(shift_logits, shift_labels)
+
+            if not return_dict:
+                output = (logits,) + outputs[1:]
+                return (loss,) + output if loss is not None else output
+
+            return CausalLMOutputWithPast(
+                loss=loss,
+                logits=logits,
+                past_key_values=None,
+                hidden_states=outputs[0],
+                attentions=None,
+            )
+        else:
+            out = {}
+            hidden_states = outputs.get("hidden_states")
+            out["hidden_states"] = hidden_states
+            return out
+
+
+def get_deepseek_flash_attention_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
+    logger = logging.get_logger(__name__)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if sp_mode is not None:
+            assert sp_mode in ["all_to_all", "split_gather", "ring"], "Invalid sp_mode"
+            assert (sp_size is not None) and (
+                sp_group is not None
+            ), "Must specify sp_size and sp_group for sequence parallel"
+
+        # DeepseekFlashAttention2 attention does not support output_attentions
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+
+            # overwrite attention_mask with padding_mask
+            attention_mask = kwargs.pop("padding_mask")
+
+        output_attentions = False
+
+        bsz, q_len, _ = hidden_states.size()
+
+        # sp: modify sp_len when sequence parallel mode is ring
+        if sp_mode in ["split_gather", "ring"]:
+            q_len *= sp_size
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # sp: all-to-all comminucation when introducing sequence parallel
+        if sp_mode == "all_to_all":
+            query_states = all_to_all_comm(query_states, sp_group)
+            key_states = all_to_all_comm(key_states, sp_group)
+            value_states = all_to_all_comm(value_states, sp_group)
+            bsz, q_len, _ = query_states.size()
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(
+            query_states, key_states, cos, sin, position_ids, unsqueeze_dim=0
+        )
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (DeepseekRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            # Handle the case where the model is quantized
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            elif torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+        attn_output = self._flash_attention_forward(
+            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
+        )
+        # sp: all-to-all comminucation when introducing sequence parallel
+        if sp_mode == "all_to_all":
+            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()  # (1, 8, 128)
+            attn_output = all_to_all_comm(attn_output, sp_group, scatter_dim=1, gather_dim=2)  # (1, 4, 256)
+        else:
+            attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+    return forward
+
+
+def get_deepseek_flash_attention_model_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
+    logger = logging.get_logger(__name__)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape[:2]
+        elif inputs_embeds is not None:
+            batch_size, seq_length = inputs_embeds.shape[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`transformers."
+                )
+                use_cache = False
+
+        past_key_values_length = 0
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if self._use_flash_attention_2:
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._use_sdpa and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            )
+
+        if sp_mode in ["ring", "split_gather"]:
+            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group)
+        elif sp_mode == "all_to_all":
+            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group, 1 / sp_size)
+        # embed positions
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if sp_mode == "ring" or sp_mode == "split_gather":
+            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+        elif sp_mode == "all_to_all":
+            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+    return forward
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index beaa47952c9f..0dbf0ca5af36 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -25,7 +25,7 @@
 from colossalai.shardformer.layer._operation import gather_forward_split_backward, split_forward_gather_backward
 from colossalai.shardformer.shard import ShardConfig
 
-from ..layer import cross_entropy_1d
+from ..layer import dist_cross_entropy
 
 logger = logging.get_logger(__name__)
 
@@ -372,27 +372,9 @@ def gpt2_lmhead_model_forward(
 
         hidden_states = outputs[0]
         lm_logits = self.lm_head(hidden_states)
-        loss = None
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(lm_logits.device)
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, shift_logits.size(-1))
-            shift_labels = shift_labels.view(-1)
-            if shard_config.enable_tensor_parallelism and shard_config.parallel_output:
-                loss = cross_entropy_1d(
-                    shift_logits,
-                    shift_labels,
-                    process_group=shard_config.tensor_parallel_process_group,
-                    vocab_size=self.lm_head.out_features,
-                    dtype=self.transformer.dtype,
-                )
-            else:
-                loss = loss_fct(shift_logits, shift_labels)
+        loss = dist_cross_entropy(
+            labels, lm_logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.transformer.dtype
+        )
 
         if not return_dict:
             output = (lm_logits,) + outputs[1:]
@@ -1284,24 +1266,9 @@ def forward(
         hidden_states = transformer_outputs[0]
 
         lm_logits = self.lm_head(hidden_states)
-
-        loss = None
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(lm_logits.device)
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            shift_logits = shift_logits.view(-1, shift_logits.size(-1))
-            shift_labels = shift_labels.view(-1)
-            loss = cross_entropy_1d(
-                shift_logits,
-                shift_labels,
-                process_group=shard_config.tensor_parallel_process_group,
-                vocab_size=self.lm_head.out_features,
-                dtype=self.transformer.dtype,
-            )
+        loss = dist_cross_entropy(
+            labels, lm_logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.transformer.dtype
+        )
 
         if not return_dict:
             output = (lm_logits,) + transformer_outputs[1:]
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index f83735f0520d..693f6584f3a7 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -31,7 +31,7 @@
 )
 from colossalai.shardformer.shard import ShardConfig
 
-from ..layer import ColoAttention, cross_entropy_1d
+from ..layer import ColoAttention, dist_cross_entropy
 
 
 class LlamaPipelineForwards:
@@ -86,13 +86,20 @@ def llama_model_forward(
             device = input_ids.device if input_ids is not None else inputs_embeds.device
             if inputs_embeds is None:
                 inputs_embeds = self.embed_tokens(input_ids)
-
             hidden_states = inputs_embeds
         else:
             input_shape = hidden_states.shape[:-1]
             batch_size, seq_length = input_shape
             device = hidden_states.device
 
+        # Support SP + PP
+        sp_mode = shard_config.sequence_parallelism_mode
+        sp_group = shard_config.sequence_parallel_process_group
+        sp_size = shard_config.sequence_parallel_size
+        if sp_mode == "all_to_all" and not stage_manager.is_first_stage():
+            # For correct positions ids. The states will be gather along the seq dim in the attention layer later.
+            seq_length *= sp_size
+
         past_seen_tokens = 0
         if use_cache:  # kept for BC (cache positions)
             if not isinstance(past_key_values, StaticCache):
@@ -101,7 +108,7 @@ def llama_model_forward(
         if cache_position is None:
             if isinstance(past_key_values, StaticCache):
                 raise ValueError("cache_position is a required argument when using StaticCache.")
-            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + hidden_states.shape[1], device=device)
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=device)
 
         seq_length_with_past = seq_length + past_seen_tokens
 
@@ -118,7 +125,6 @@ def llama_model_forward(
 
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
-
         # embed positions, for the first stage, hidden_states is the input embeddings,
         # for the other stages, hidden_states is the output of the previous stage
         if shard_config.enable_flash_attention:
@@ -134,6 +140,13 @@ def llama_model_forward(
         else:
             attention_mask = self._update_causal_mask(attention_mask, hidden_states, cache_position)
 
+        # Support SP + PP
+        if stage_manager.is_first_stage():
+            if sp_mode in ["ring", "split_gather"]:
+                hidden_states = split_forward_gather_backward(hidden_states, 1, sp_group)
+            elif sp_mode == "all_to_all":
+                hidden_states = split_forward_gather_backward(hidden_states, 1, sp_group, 1 / sp_size)
+
         if self.gradient_checkpointing and self.training and use_cache:
             if use_cache:
                 logger.warning_once(
@@ -196,6 +209,10 @@ def llama_model_forward(
 
         if stage_manager.is_last_stage():
             hidden_states = self.norm(hidden_states)
+            if sp_mode == "ring" or sp_mode == "split_gather":
+                hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+            elif sp_mode == "all_to_all":
+                hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
@@ -304,29 +321,9 @@ def llama_for_causal_lm_forward(
         if stage_manager.is_last_stage():
             hidden_states = outputs[0]
             logits = self.lm_head(hidden_states)
-            loss = None
-            if labels is not None:
-                # Shift so that tokens < n predict n
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-                # Flatten the tokens
-                loss_fct = CrossEntropyLoss()
-                shift_labels = shift_labels.view(-1)
-                # Enable model parallelism
-                shift_labels = shift_labels.to(shift_logits.device)
-                if shard_config.enable_tensor_parallelism and shard_config.parallel_output:
-                    new_vocab_size = logits.shape[-1]
-                    shift_logits = shift_logits.view(-1, new_vocab_size)
-                    loss = cross_entropy_1d(
-                        shift_logits,
-                        shift_labels,
-                        process_group=shard_config.tensor_parallel_process_group,
-                        vocab_size=self.lm_head.out_features,
-                        dtype=self.model.dtype,
-                    )
-                else:
-                    shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                    loss = loss_fct(shift_logits, shift_labels)
+            loss = dist_cross_entropy(
+                labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
+            )
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
@@ -529,7 +526,6 @@ def forward(
                 )
 
             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-
         cos, sin = self.rotary_emb(value_states, position_ids)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
@@ -649,7 +645,7 @@ def forward(
 
         # in this case, attention_mask is a dict rather than a tensor
         if shard_config.enable_flash_attention:
-            mask_shape = (inputs_embeds.shape[0], 1, past_seen_tokens + seq_len, past_seen_tokens + seq_len)
+            mask_shape = (inputs_embeds.shape[0], 1, seq_len, past_seen_tokens + seq_len)
             attention_mask = ColoAttention.prepare_attn_kwargs(
                 mask_shape,
                 inputs_embeds.dtype,
@@ -814,24 +810,9 @@ def forward(
             logits = self.lm_head(hidden_states)
         logits = logits.float()
 
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            new_vocab_size = logits.shape[-1]
-            shift_logits = shift_logits.view(-1, new_vocab_size)
-            loss = cross_entropy_1d(
-                shift_logits,
-                shift_labels,
-                process_group=shard_config.tensor_parallel_process_group,
-                vocab_size=self.lm_head.out_features,
-                dtype=self.model.dtype,
-            )
-
+        loss = dist_cross_entropy(
+            labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
+        )
         if not return_dict:
             output = (logits,) + outputs[1:]
             return (loss,) + output if loss is not None else output
diff --git a/colossalai/shardformer/modeling/mistral.py b/colossalai/shardformer/modeling/mistral.py
index 310c2d8e233a..ec1a8a00a58a 100644
--- a/colossalai/shardformer/modeling/mistral.py
+++ b/colossalai/shardformer/modeling/mistral.py
@@ -19,7 +19,7 @@
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer.shard import ShardConfig
 
-from ..layer import ColoAttention, cross_entropy_1d
+from ..layer import ColoAttention, dist_cross_entropy
 
 logger = logging.get_logger(__name__)
 
@@ -91,7 +91,7 @@ def mistral_model_forward(
 
         if shard_config.enable_flash_attention:
             # in this case, attention_mask is a dict rather than a tensor
-            mask_shape = (batch_size, 1, seq_length, seq_length)
+            mask_shape = (batch_size, 1, seq_length, seq_length + past_key_values_length)
             attention_mask = ColoAttention.prepare_attn_kwargs(
                 mask_shape,
                 hidden_states.dtype,
@@ -275,29 +275,9 @@ def mistral_for_causal_lm_forward(
             logits = self.lm_head(hidden_states)
             logits = logits.float()
 
-            loss = None
-            if labels is not None:
-                # Shift so that tokens < n predict n
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-                # Flatten the tokens
-                loss_fct = CrossEntropyLoss()
-                shift_labels = shift_labels.view(-1)
-                # Enable model parallelism
-                shift_labels = shift_labels.to(shift_logits.device)
-                if shard_config.enable_tensor_parallelism and shard_config.parallel_output:
-                    new_vocab_size = logits.shape[-1]
-                    shift_logits = shift_logits.view(-1, new_vocab_size)
-                    loss = cross_entropy_1d(
-                        shift_logits,
-                        shift_labels,
-                        process_group=shard_config.tensor_parallel_process_group,
-                        vocab_size=self.lm_head.out_features,
-                        dtype=self.model.dtype,
-                    )
-                else:
-                    shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                    loss = loss_fct(shift_logits, shift_labels)
+            loss = dist_cross_entropy(
+                labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
+            )
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
@@ -708,23 +688,9 @@ def forward(
         logits = self.lm_head(hidden_states)
         logits = logits.float()
 
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            new_vocab_size = logits.shape[-1]
-            shift_logits = shift_logits.view(-1, new_vocab_size)
-            loss = cross_entropy_1d(
-                shift_logits,
-                shift_labels,
-                process_group=shard_config.tensor_parallel_process_group,
-                vocab_size=self.lm_head.out_features,
-                dtype=self.model.dtype,
-            )
+        loss = dist_cross_entropy(
+            labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
+        )
 
         if not return_dict:
             output = (logits,) + outputs[1:]
diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index 2fbc34302cde..d30ce5ea85cc 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -1,52 +1,105 @@
-from typing import List, Optional
+import inspect
+import warnings
+from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
 from torch.distributed import ProcessGroup
-
-# from colossalai.tensor.moe_tensor.moe_info import MoeParallelInfo
 from torch.nn import CrossEntropyLoss
-from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.modeling_attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from transformers.models.mixtral.modeling_mixtral import (
     MixtralSparseMoeBlock,
     MoeCausalLMOutputWithPast,
+    MoeModelOutputWithPast,
+    apply_rotary_pos_emb,
     load_balancing_loss_func,
+    repeat_kv,
 )
 from transformers.utils import is_flash_attn_2_available, logging
 
 from colossalai.lazy import LazyInitContext
-from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler, all_to_all_uneven
+from colossalai.moe._operation import (
+    DPGradScalerIn,
+    DPGradScalerOut,
+    EPGradScalerIn,
+    EPGradScalerOut,
+    all_to_all_uneven,
+)
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.layer._operation import (
+    all_to_all_comm,
+    gather_forward_split_backward,
+    split_forward_gather_backward,
+)
+from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.shardformer.shard.utils import set_tensors_to_none
+from colossalai.tensor.moe_tensor.api import set_moe_tensor_ep_group
+
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func
+
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
 
 
 class EPMixtralSparseMoeBlock(MixtralSparseMoeBlock):
-    def __init__(self, config):
-        self.moe_info = None
-        super().__init__(config)
-
-    def setup_ep(self, ep_group: ProcessGroup):
-        ep_group = ep_group
-        self.ep_size = dist.get_world_size(ep_group) if ep_group is not None else 1
-        self.ep_rank = dist.get_rank(ep_group) if ep_group is not None else 0
-        assert self.num_experts % self.ep_size == 0
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError(f"Please use `from_native_module` to create an instance of {self.__class__.__name__}")
+
+    def setup_process_groups(self, tp_group: ProcessGroup, moe_dp_group: ProcessGroup, ep_group: ProcessGroup):
+        assert tp_group is not None
+        assert moe_dp_group is not None
+        assert ep_group is not None
+
+        # setup ep group
+        self.ep_size = dist.get_world_size(ep_group)
+        self.ep_rank = dist.get_rank(ep_group)
         self.ep_group = ep_group
+
+        if self.num_experts % self.ep_size != 0:
+            raise ValueError("The number of experts must be divisible by the number of expert parallel groups.")
+
         self.num_experts_per_ep = self.num_experts // self.ep_size
         self.expert_start_idx = self.ep_rank * self.num_experts_per_ep
         held_experts = self.experts[self.expert_start_idx : self.expert_start_idx + self.num_experts_per_ep]
+
         set_tensors_to_none(self.experts, exclude=set(held_experts))
+
+        # setup moe_dp group
+        self.moe_dp_group = moe_dp_group
+        self.moe_dp_size = moe_dp_group.size()
+
+        # setup global tp group
+        self.tp_group = tp_group
+        if self.tp_group.size() > 1:
+            for expert in held_experts:
+                expert.w1 = Linear1D_Col.from_native_module(expert.w1, self.tp_group)
+                expert.w3 = Linear1D_Col.from_native_module(expert.w3, self.tp_group)
+                expert.w2 = Linear1D_Row.from_native_module(expert.w2, self.tp_group)
+
         for p in self.experts.parameters():
-            p.ep_group = ep_group
+            set_moe_tensor_ep_group(p, ep_group)
 
     @staticmethod
-    def from_native_module(module: MixtralSparseMoeBlock, *args, **kwargs) -> "EPMixtralSparseMoeBlock":
+    def from_native_module(
+        module: MixtralSparseMoeBlock,
+        tp_group: ProcessGroup,
+        moe_dp_group: ProcessGroup,
+        ep_group: ProcessGroup,
+        *args,
+        **kwargs,
+    ) -> "EPMixtralSparseMoeBlock":
+        # TODO: better init
         LazyInitContext.materialize(module)
         module.__class__ = EPMixtralSparseMoeBlock
-        # if "ep_group" in kwargs:
-        assert "ep_group" in kwargs, "You should pass ep_group in SubModuleReplacementDescription via shard_config!!"
-        module.setup_ep(kwargs["ep_group"])
+        module.setup_process_groups(tp_group, moe_dp_group, ep_group)
         return module
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -65,20 +118,31 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         selected_experts_idx = selected_experts.argsort()
         dispatch_states = hidden_states.repeat(self.top_k, 1)[selected_experts_idx]
         input_split_sizes = selected_experts.bincount(minlength=self.num_experts)
+
         output_split_sizes = torch.zeros_like(input_split_sizes)
         dist.all_to_all_single(output_split_sizes, input_split_sizes, group=self.ep_group)
 
+        with torch.no_grad():
+            activate_experts = output_split_sizes[: self.num_experts_per_ep].clone()
+            for i in range(1, self.ep_size):
+                activate_experts += output_split_sizes[i * self.num_experts_per_ep : (i + 1) * self.num_experts_per_ep]
+            activate_experts = (activate_experts > 0).float()
+        dist.all_reduce(activate_experts, group=self.moe_dp_group)
+
         input_split_list = input_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()
         output_split_list = output_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()
+
         output_states, _ = all_to_all_uneven(dispatch_states, input_split_list, output_split_list, self.ep_group)
         # compute expert output
-        output_states = MoeInGradScaler.apply(output_states, self.ep_size)
+        output_states = EPGradScalerIn.apply(output_states, self.ep_size)
         if output_states.size(0) > 0:
             if self.num_experts_per_ep == 1:
                 # no need to split
                 expert = self.experts[self.expert_start_idx]
+                output_states = DPGradScalerIn.apply(output_states, self.moe_dp_size, activate_experts[0])
                 output_states = expert.act_fn(expert.w1(output_states)) * expert.w3(output_states)
                 output_states = expert.w2(output_states)
+                output_states = DPGradScalerOut.apply(output_states, self.moe_dp_size, activate_experts[0])
             else:
                 output_states_splits = output_states.split(output_split_sizes.tolist())
                 output_states_list = []
@@ -86,12 +150,20 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                     if split_states.size(0) == 0:
                         continue
                     expert = self.experts[self.expert_start_idx + i % self.num_experts_per_ep]
+                    split_states = DPGradScalerIn.apply(
+                        split_states, self.moe_dp_size, activate_experts[i % self.num_experts_per_ep]
+                    )
                     split_states = expert.act_fn(expert.w1(split_states)) * expert.w3(split_states)
                     split_states = expert.w2(split_states)
+                    split_states = DPGradScalerOut.apply(
+                        split_states, self.moe_dp_size, activate_experts[i % self.num_experts_per_ep]
+                    )
                     output_states_list.append(split_states)
                 output_states = torch.cat(output_states_list)
-        output_states = MoeOutGradScaler.apply(output_states, self.ep_size)
+
+        output_states = EPGradScalerOut.apply(output_states, self.ep_size)
         dispatch_states, _ = all_to_all_uneven(output_states, output_split_list, input_split_list, self.ep_group)
+
         recover_experts_idx = torch.empty_like(selected_experts_idx)
         recover_experts_idx[selected_experts_idx] = torch.arange(
             selected_experts_idx.size(0), device=selected_experts_idx.device
@@ -107,7 +179,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
 class MixtralPipelineForwards:
     """
-    This class serves as a micro library for forward function substitution of Llama models
+    This class serves as a micro library for forward function substitution of Mixtral models
     under pipeline setting.
     """
 
@@ -300,16 +372,29 @@ def custom_forward(*inputs):
         if output_router_logits and past_router_logits is not None:
             all_router_logits = past_router_logits + all_router_logits
         if stage_manager.is_last_stage():
-            return tuple(
-                v
-                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
-                if v is not None
+            if not return_dict:
+                return tuple(
+                    v
+                    for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
+                    if v is not None
+                )
+            return MoeModelOutputWithPast(
+                last_hidden_state=hidden_states,
+                past_key_values=next_cache,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attns,
+                router_logits=all_router_logits,
             )
-        # always return dict for imediate stage
-        return {
-            "hidden_states": hidden_states,
-            "past_router_logits": all_router_logits,
-        }
+        else:
+            if output_router_logits:
+                return {
+                    "hidden_states": hidden_states,
+                    "past_router_logits": all_router_logits,
+                }
+            else:
+                return {
+                    "hidden_states": hidden_states,
+                }
 
     @staticmethod
     def mixtral_for_causal_lm_forward(
@@ -441,3 +526,335 @@ def mixtral_for_causal_lm_forward(
             if output_router_logits:
                 out["past_router_logits"] = outputs["past_router_logits"]
             return out
+
+
+def get_mixtral_flash_attention_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
+    logger = logging.get_logger(__name__)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
+        if sp_mode is not None:
+            assert sp_mode in ["all_to_all", "split_gather", "ring"], "Invalid sp_mode"
+            assert (sp_size is not None) and (
+                sp_group is not None
+            ), "Must specify sp_size and sp_group for sequence parallel"
+
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+
+            # overwrite attention_mask with padding_mask
+            attention_mask = kwargs.pop("padding_mask")
+        bsz, q_len, _ = hidden_states.size()
+
+        # sp: modify sp_len when sequence parallel mode is ring
+        if sp_mode in ["split_gather", "ring"]:
+            q_len *= sp_size
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # sp: all-to-all comminucation when introducing sequence parallel
+        if sp_mode == "all_to_all":
+            query_states = all_to_all_comm(query_states, sp_group)
+            key_states = all_to_all_comm(key_states, sp_group)
+            value_states = all_to_all_comm(value_states, sp_group)
+            bsz, q_len, _ = query_states.size()
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        # Because the input can be padded, the absolute sequence length depends on the max position id.
+        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
+
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        use_sliding_windows = (
+            _flash_supports_window_size
+            and getattr(self.config, "sliding_window", None) is not None
+            and kv_seq_len > self.config.sliding_window
+        )
+        if not _flash_supports_window_size:
+            logger.warning_once(
+                "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
+                " make sure to upgrade flash-attn library."
+            )
+        if past_key_value is not None:
+            # Activate slicing cache only if the config has a value `sliding_windows` attribute
+            cache_has_contents = past_key_value.get_seq_length(self.layer_idx) > 0
+            if (
+                getattr(self.config, "sliding_window", None) is not None
+                and kv_seq_len > self.config.sliding_window
+                and cache_has_contents
+            ):
+                slicing_tokens = 1 - self.config.sliding_window
+
+                past_key = past_key_value[self.layer_idx][0]
+                past_value = past_key_value[self.layer_idx][1]
+
+                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
+                past_value = past_value[:, :, slicing_tokens:, :].contiguous()
+
+                if past_key.shape[-2] != self.config.sliding_window - 1:
+                    raise ValueError(
+                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
+                        f" {past_key.shape}"
+                    )
+
+                if attention_mask is not None:
+                    attention_mask = attention_mask[:, slicing_tokens:]
+                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
+
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        dropout_rate = 0.0 if not self.training else self.attention_dropout
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in float16 just to be sure everything works as expected.
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+        # Reashape to the expected shape for Flash Attention
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        attn_output = self._flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=dropout_rate,
+            use_sliding_windows=use_sliding_windows,
+        )
+
+        # sp: all-to-all comminucation when introducing sequence parallel
+        if sp_mode == "all_to_all":
+            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()  # (1, 8, 128)
+            attn_output = all_to_all_comm(attn_output, sp_group, scatter_dim=1, gather_dim=2)  # (1, 4, 256)
+        else:
+            attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+
+    return forward
+
+
+def get_mixtral_flash_attention_model_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
+    logger = logging.get_logger(__name__)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, MoeModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        past_key_values_length = 0
+
+        if (self.gradient_checkpointing or sp_mode in ["ring", "all_to_all"]) and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if attention_mask is not None and self._attn_implementation == "flash_attention_2" and use_cache:
+            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+            if is_padding_right:
+                raise ValueError(
+                    "You are attempting to perform batched generation with padding_side='right'"
+                    " this may lead to unexpected behaviour for Flash Attention version of Mixtral. Make sure to "
+                    " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
+                )
+        if self._attn_implementation == "flash_attention_2":
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._attn_implementation == "sdpa" and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+                sliding_window=self.config.sliding_window,
+            )
+
+        if sp_mode in ["ring", "split_gather"]:
+            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group)
+        elif sp_mode == "all_to_all":
+            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group, 1 / sp_size)
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_router_logits = () if output_router_logits else None
+        next_decoder_cache = None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    output_router_logits,
+                    use_cache,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    output_router_logits=output_router_logits,
+                    use_cache=use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+            if output_router_logits:
+                all_router_logits += (layer_outputs[-1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if sp_mode == "ring" or sp_mode == "split_gather":
+            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+        elif sp_mode == "all_to_all":
+            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
+                if v is not None
+            )
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            router_logits=all_router_logits,
+        )
+
+    return forward
diff --git a/colossalai/shardformer/modeling/opt.py b/colossalai/shardformer/modeling/opt.py
index b250b4976ec6..636b46cc461d 100644
--- a/colossalai/shardformer/modeling/opt.py
+++ b/colossalai/shardformer/modeling/opt.py
@@ -22,7 +22,7 @@
 from colossalai.shardformer.layer import ColoAttention
 from colossalai.shardformer.shard import ShardConfig
 
-from ..layer import cross_entropy_1d
+from ..layer import dist_cross_entropy
 
 logger = logging.get_logger(__name__)
 
@@ -330,30 +330,14 @@ def opt_for_causal_lm_forward(
         )
         if stage_manager.is_last_stage():
             logits = self.lm_head(outputs[0]).contiguous()
-            loss = None
-            if labels is not None:
-                # move labels to correct device to enable model parallelism
-                labels = labels.to(logits.device)
-                # Shift so that tokens < n predict n
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-                # Flatten the tokens
-
-                if shard_config.enable_tensor_parallelism and shard_config.parallel_output:
-                    new_vocab_size = logits.shape[-1]
-                    shift_logits = shift_logits.view(-1, new_vocab_size)
-                    shift_labels = shift_labels.view(-1)
-                    loss = cross_entropy_1d(
-                        shift_logits,
-                        shift_labels,
-                        process_group=shard_config.tensor_parallel_process_group,
-                        vocab_size=self.lm_head.out_features,
-                        dtype=self.model.decoder.dtype,
-                    )
-                else:
-                    loss_fct = CrossEntropyLoss()
-                    shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                    loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))
+            loss = dist_cross_entropy(
+                labels,
+                logits,
+                shard_config,
+                self.lm_head.out_features,
+                self.config.vocab_size,
+                self.model.decoder.dtype,
+            )
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
@@ -971,26 +955,9 @@ def forward(
         )
 
         logits = self.lm_head(outputs[0]).contiguous()
-
-        loss = None
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(logits.device)
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            new_vocab_size = logits.shape[-1]
-            shift_logits = shift_logits.view(-1, new_vocab_size)
-            loss = cross_entropy_1d(
-                shift_logits,
-                shift_labels,
-                process_group=shard_config.tensor_parallel_process_group,
-                vocab_size=self.lm_head.out_features,
-                dtype=self.model.decoder.dtype,
-            )
+        loss = dist_cross_entropy(
+            labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.decoder.dtype
+        )
 
         if not return_dict:
             output = (logits,) + outputs[1:]
diff --git a/colossalai/shardformer/modeling/qwen2.py b/colossalai/shardformer/modeling/qwen2.py
index 11c26822f50a..538e96c32c6d 100644
--- a/colossalai/shardformer/modeling/qwen2.py
+++ b/colossalai/shardformer/modeling/qwen2.py
@@ -1,6 +1,8 @@
+import math
 from typing import List, Optional, Tuple, Union
 
 import torch
+from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
@@ -30,9 +32,14 @@
 from transformers.utils import logging
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.layer._operation import (
+    all_to_all_comm,
+    gather_forward_split_backward,
+    split_forward_gather_backward,
+)
 from colossalai.shardformer.shard import ShardConfig
 
-from ..layer import ColoAttention, cross_entropy_1d
+from ..layer import ColoAttention, dist_cross_entropy
 
 
 class Qwen2PipelineForwards:
@@ -129,7 +136,7 @@ def qwen2_model_forward(
         # for the other stages, hidden_states is the output of the previous stage
         if shard_config.enable_flash_attention:
             # in this case, attention_mask is a dict rather than a tensor
-            mask_shape = (batch_size, 1, seq_length_with_past, seq_length_with_past)
+            mask_shape = (batch_size, 1, seq_length, seq_length_with_past)
             attention_mask = ColoAttention.prepare_attn_kwargs(
                 mask_shape,
                 hidden_states.dtype,
@@ -162,6 +169,21 @@ def qwen2_model_forward(
                     sliding_window=self.config.sliding_window,
                 )
 
+        if shard_config and shard_config.enable_sequence_parallelism:
+            if shard_config.sequence_parallelism_mode in ["split_gather", "ring"]:
+                hidden_states = split_forward_gather_backward(
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.tensor_parallel_process_group,
+                )
+            elif shard_config.sequence_parallelism_mode == "all_to_all":
+                hidden_states = split_forward_gather_backward(
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.sequence_parallel_process_group,
+                    grad_scale=1 / shard_config.sequence_parallel_size,
+                )
+
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
@@ -218,6 +240,20 @@ def qwen2_model_forward(
         if stage_manager.is_last_stage():
             hidden_states = self.norm(hidden_states)
 
+        if shard_config and shard_config.enable_sequence_parallelism:
+            if shard_config.sequence_parallelism_mode in ["split_gather", "ring"]:
+                hidden_states = gather_forward_split_backward(
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.tensor_parallel_process_group,
+                )
+            elif shard_config.sequence_parallelism_mode == "all_to_all":
+                hidden_states = gather_forward_split_backward(
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.sequence_parallel_process_group,
+                    grad_scale=shard_config.sequence_parallel_size,
+                )
         # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
@@ -317,25 +353,9 @@ def qwen2_for_causal_lm_forward(
         if stage_manager.is_last_stage():
             hidden_states = outputs[0]
             logits = self.lm_head(hidden_states)
-            loss = None
-            if labels is not None:
-                # Shift so that tokens < n predict n
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-                # Flatten the tokens
-                loss_fct = CrossEntropyLoss()
-                shift_labels = shift_labels.view(-1)
-                # Enable model parallelism
-                shift_labels = shift_labels.to(shift_logits.device)
-                if shard_config.enable_tensor_parallelism:
-                    new_vocab_size = logits.shape[-1]
-                    shift_logits = shift_logits.view(-1, new_vocab_size)
-                    loss = cross_entropy_1d(
-                        shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
-                    )
-                else:
-                    shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                    loss = loss_fct(shift_logits, shift_labels)
+            loss = dist_cross_entropy(
+                labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, logits.dtype
+            )
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
@@ -469,7 +489,7 @@ def qwen2_for_sequence_classification_forward(
             return {"hidden_states": hidden_states}
 
 
-def get_qwen2_flash_attention_forward(shard_config: ShardConfig):
+def get_qwen2_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None):
     def forward(
         self: Qwen2Attention,
         hidden_states: torch.Tensor,
@@ -480,11 +500,26 @@ def forward(
         use_cache: bool = False,
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if sp_mode is not None:
+            assert sp_mode in ["all_to_all", "split_gather", "ring"], "Invalid sp_mode"
+            assert (sp_size is not None) and (
+                sp_group is not None
+            ), "Must specify sp_size and sp_group for sequence parallel"
+
         bsz, q_len, _ = hidden_states.size()
+        # sp: modify sp_len when sequence parallel mode is ring
+        if sp_mode in ["split_gather", "ring"]:
+            q_len *= sp_size
 
         query_states = self.q_proj(hidden_states)
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
+        # sp: all-to-all comminucation when introducing sequence parallel
+        if sp_mode == "all_to_all":
+            query_states = all_to_all_comm(query_states, sp_group)
+            key_states = all_to_all_comm(key_states, sp_group)
+            value_states = all_to_all_comm(value_states, sp_group)
+            bsz, q_len, _ = query_states.size()
 
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
@@ -538,10 +573,41 @@ def forward(
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
-        assert isinstance(attention_mask, dict), "Flash Attention Error: attention_mask should be a dict."
-        attn_output = ColoAttention.attention(query_states, key_states, value_states, **attention_mask)
+        if shard_config.enable_flash_attention:
+            assert isinstance(attention_mask, dict), "Flash Attention Error: attention_mask should be a dict."
+            attn_output = ColoAttention.attention(query_states, key_states, value_states, **attention_mask)
+        else:
+            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+            if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                    f" {attn_weights.size()}"
+                )
+
+            if attention_mask is not None:
+                if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                    raise ValueError(
+                        f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                    )
+                attn_weights = attn_weights + attention_mask
+
+            # upcast attention to fp32
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+            attn_output = torch.matmul(attn_weights, value_states)
+
+            if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+                raise ValueError(
+                    f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                    f" {attn_output.size()}"
+                )
         attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        if sp_mode == "all_to_all":
+            attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
+            attn_output = all_to_all_comm(attn_output, sp_group, scatter_dim=1, gather_dim=2)
+        else:
+            attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
         attn_output = self.o_proj(attn_output)
 
         return attn_output, None, past_key_value
@@ -549,9 +615,8 @@ def forward(
     return forward
 
 
-def get_qwen2_model_forward_for_flash_attn(shard_config: ShardConfig):
+def get_qwen2_model_forward_for_flash_attn(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None):
     logger = logging.get_logger(__name__)
-    assert shard_config.enable_flash_attention, "Flash Attention is not enabled."
 
     def forward(
         self,
@@ -586,6 +651,10 @@ def forward(
         seq_length_with_past = seq_length
         past_key_values_length = 0
 
+        if past_key_values is not None:
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+
         if position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device
             position_ids = torch.arange(
@@ -601,17 +670,26 @@ def forward(
         # embed positions
         hidden_states = inputs_embeds
 
-        # in this case, attention_mask is a dict rather than a tensor
-        mask_shape = (batch_size, 1, seq_length_with_past, seq_length_with_past)
-        attention_mask = ColoAttention.prepare_attn_kwargs(
-            mask_shape,
-            hidden_states.dtype,
-            hidden_states.device,
-            q_padding_mask=attention_mask,
-            is_causal=True,
-        )
+        if shard_config.enable_flash_attention:
+            # in this case, attention_mask is a dict rather than a tensor
+            mask_shape = (batch_size, 1, seq_length, seq_length_with_past)
+            attention_mask = ColoAttention.prepare_attn_kwargs(
+                mask_shape,
+                hidden_states.dtype,
+                hidden_states.device,
+                q_padding_mask=attention_mask,
+                is_causal=True,
+            )
+        else:
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+                sliding_window=self.config.sliding_window,
+            )
 
-        if self.gradient_checkpointing and self.training:
+        if (self.gradient_checkpointing or sp_mode in ["ring", "all_to_all"]) and self.training:
             if use_cache:
                 logger.warning_once(
                     "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
@@ -623,6 +701,11 @@ def forward(
         all_self_attns = () if output_attentions else None
         next_decoder_cache = None
 
+        if sp_mode in ["ring", "split_gather"]:
+            hidden_states = split_forward_gather_backward(hidden_states, 1, sp_group)
+        elif sp_mode == "all_to_all":
+            hidden_states = split_forward_gather_backward(hidden_states, 1, sp_group, 1 / sp_size)
+
         for decoder_layer in self.layers:
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
@@ -657,6 +740,11 @@ def forward(
 
         hidden_states = self.norm(hidden_states)
 
+        if sp_mode == "ring" or sp_mode == "split_gather":
+            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+        elif sp_mode == "all_to_all":
+            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
+
         # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
@@ -737,26 +825,9 @@ def forward(
         hidden_states = outputs[0]
         logits = self.lm_head(hidden_states)
         logits = logits.float()
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            if shard_config.enable_tensor_parallelism:
-                new_vocab_size = logits.shape[-1]
-                shift_logits = shift_logits.view(-1, new_vocab_size)
-                loss = cross_entropy_1d(
-                    shift_logits, shift_labels, process_group=shard_config.tensor_parallel_process_group
-                )
-            else:
-                shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                loss = loss_fct(shift_logits, shift_labels)
+        loss = dist_cross_entropy(
+            labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, logits.dtype
+        )
 
         if not return_dict:
             output = (logits,) + outputs[1:]
diff --git a/colossalai/shardformer/policies/auto_policy.py b/colossalai/shardformer/policies/auto_policy.py
index bf139c840985..7b9c759a66c2 100644
--- a/colossalai/shardformer/policies/auto_policy.py
+++ b/colossalai/shardformer/policies/auto_policy.py
@@ -160,6 +160,13 @@ class PolicyLocation:
     "transformers_modules.modeling_chatglm.ChatGLMForConditionalGeneration": PolicyLocation(
         file_name="chatglm2", class_name="ChatGLMForConditionalGenerationPolicy"
     ),
+    # Deepseek
+    "transformers_modules.modeling_deepseek.DeepseekModel": PolicyLocation(
+        file_name="deepseek", class_name="DeepseekModelPolicy"
+    ),
+    "transformers_modules.modeling_deepseek.DeepseekForCausalLM": PolicyLocation(
+        file_name="deepseek", class_name="DeepseekForCausalLMPolicy"
+    ),
     # Falcon
     "transformers.models.falcon.modeling_falcon.FalconModel": PolicyLocation(
         file_name="falcon", class_name="FalconModelPolicy"
@@ -193,6 +200,9 @@ class PolicyLocation:
     "transformers.models.mixtral.modeling_mixtral.MixtralForCausalLM": PolicyLocation(
         file_name="mixtral", class_name="MixtralForCausalLMPolicy"
     ),
+    "transformers.models.mixtral.modeling_mixtral.MixtralForSequenceClassification": PolicyLocation(
+        file_name="mixtral", class_name="MixtralForSequenceClassificationPolicy"
+    ),
     # Qwen2
     "transformers.models.qwen2.modeling_qwen2.Qwen2Model": PolicyLocation(
         file_name="qwen2", class_name="Qwen2ModelPolicy"
@@ -233,6 +243,9 @@ def _fullname(obj):
     # patch custom models which are not in transformers
     # it can be like 'transformers_modules.THUDM.chatglm3-6b.103caa40027ebfd8450289ca2f278eac4ff26405.modeling_chatglm' (from huggingface hub)
     # or like 'transformers_modules.chatglm.modeling_chatglm' (from local directory)
+    if module.startswith("peft"):
+        klass = obj.base_model.model.__class__
+        module = klass.__module__
     if module.startswith("transformers_modules"):
         split_module = module.split(".")
         if len(split_module) >= 2:
@@ -252,7 +265,6 @@ def get_autopolicy(model: nn.Module) -> Policy:
     """
     full_name = _fullname(model)
     policy_location = _POLICY_LIST.get(full_name, None)
-
     if policy_location is None:
         raise NotImplementedError(
             f"Auto policy for {model.__class__.__qualname__} ({full_name}) is not implemented\n. Supported models are {list(_POLICY_LIST.keys())}"
diff --git a/colossalai/shardformer/policies/chatglm2.py b/colossalai/shardformer/policies/chatglm2.py
index 01aa77e57c00..3877bdac3ae2 100644
--- a/colossalai/shardformer/policies/chatglm2.py
+++ b/colossalai/shardformer/policies/chatglm2.py
@@ -9,6 +9,7 @@
 from colossalai.shardformer.modeling.chatglm2 import ChatGLMPipelineForwards
 
 from ..modeling.chatglm2 import (
+    get_chatglm_sequence_parallel_attention_forward,
     get_chatglm_sequence_parallel_forward_fn,
     get_flash_core_attention_forward,
     get_jit_fused_glm_block_forward,
@@ -58,14 +59,29 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 norm_cls = col_nn.LayerNorm
 
         sp_mode = self.shard_config.sequence_parallelism_mode or None
-        assert sp_mode != "all_to_all", "all_to_all sequence parallelism is not supported for ChatGLM2"
+        sp_size = self.shard_config.sequence_parallel_size or None
+        sp_group = self.shard_config.sequence_parallel_process_group or None
+
         if sp_mode == "ring":
             warnings.warn(
                 f"For ChatGLM2, sequence parallelism is currently not support mode {sp_mode}, will set to be split_gather"
             )
             sp_mode = "split_gather"
         overlap = self.shard_config.enable_sequence_overlap
-        sp_partial_derived = sp_mode == "split_gather"
+        sp_partial_derived = sp_mode in ["split_gather"]
+
+        if sp_mode == "all_to_all":
+            decoder_attribute_replacement = {
+                "num_heads": self.model.config.num_attention_heads // sp_size,
+                "hidden_size_per_partition": self.model.config.kv_channels
+                * self.model.config.num_attention_heads
+                // sp_size,
+            }
+            if getattr(self.model.config, "num_key_value_heads", False):
+                decoder_attribute_replacement["num_key_value_heads"] = self.model.config.num_key_value_heads // sp_size
+            policy["CoreAttention"] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+            )
 
         if self.shard_config.enable_tensor_parallelism:
             assert (
@@ -179,12 +195,26 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
             )
 
         # use sequence parallel
-        if sp_mode == "split_gather":
+        if self.shard_config.enable_sequence_parallelism:
             self.append_or_create_method_replacement(
-                description={"forward": get_chatglm_sequence_parallel_forward_fn(self.shard_config)},
+                description={
+                    "forward": get_chatglm_sequence_parallel_attention_forward(
+                        self.shard_config, sp_mode, sp_size, sp_group
+                    ),
+                },
                 policy=policy,
-                target_key="ChatGLMModel",
+                target_key="SelfAttention",
             )
+            if self.pipeline_stage_manager is None:
+                self.append_or_create_method_replacement(
+                    description={
+                        "forward": get_chatglm_sequence_parallel_forward_fn(
+                            self.shard_config, sp_mode, sp_size, sp_group
+                        )
+                    },
+                    policy=policy,
+                    target_key="ChatGLMModel",
+                )
 
         # use jit fused operator
         if self.shard_config.enable_jit_fused:
diff --git a/colossalai/shardformer/policies/command.py b/colossalai/shardformer/policies/command.py
index 902baf2e177c..a9b915d10485 100644
--- a/colossalai/shardformer/policies/command.py
+++ b/colossalai/shardformer/policies/command.py
@@ -1,4 +1,3 @@
-import warnings
 from functools import partial
 from typing import Callable, Dict, List, Union
 
@@ -66,13 +65,6 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         else:
             norm_cls = LayerNorm
 
-        if self.pipeline_stage_manager is not None:
-            self.shard_config.enable_sequence_parallelism = False
-            self.shard_config.enable_sequence_overlap = False
-            self.shard_config.sequence_parallelism_mode = None
-            warnings.warn(
-                f"For Command, sequence parallelism is currently not compatible with pipeline parallelism, set to be False"
-            )
         sp_mode = self.shard_config.sequence_parallelism_mode or None
         sp_size = self.shard_config.sequence_parallel_size or None
         sp_group = self.shard_config.sequence_parallel_process_group or None
diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py
new file mode 100644
index 000000000000..605f69c4a632
--- /dev/null
+++ b/colossalai/shardformer/policies/deepseek.py
@@ -0,0 +1,347 @@
+from functools import partial
+from typing import Callable, Dict, List, Union
+
+import torch.nn as nn
+from torch import Tensor
+from torch.nn import Module
+from transformers.utils import is_flash_attn_greater_or_equal_2_10
+
+from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
+from colossalai.shardformer.layer.embedding import PaddingEmbedding, VocabParallelEmbedding1D
+from colossalai.shardformer.layer.linear import Linear1D_Row
+from colossalai.shardformer.modeling.deepseek import (
+    DeepseekPipelineForwards,
+    EPDeepseekMoE,
+    get_deepseek_flash_attention_forward,
+    get_deepseek_flash_attention_model_forward,
+)
+from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+
+__all__ = ["DeepseekPolicy", "DeepseekForCausalLMPolicy"]
+
+
+class DeepseekPolicy(Policy):
+    def config_sanity_check(self):
+        pass
+
+    def preprocess(self):
+        self.tie_weight = self.tie_weight_check()
+        self.origin_attn_implement = self.model.config._attn_implementation
+        """
+        Because transformers library's bug for AutoModel/AutoConfig, who pop “attn_implement” twice from modeling_utils.py and configuration_utils.py.
+        This bug causes attn_cls to be set to sdpa. Here we assign it to "flash_attention_2".
+        """
+        # self.origin_attn_implement =  "flash_attention_2"
+        if self.shard_config.enable_tensor_parallelism:
+            # Resize embedding
+            vocab_size = self.model.config.vocab_size
+            world_size = self.shard_config.tensor_parallel_size
+
+            if vocab_size % world_size != 0:
+                new_vocab_size = vocab_size + world_size - vocab_size % world_size
+                self.model.resize_token_embeddings(new_vocab_size)
+
+        return self.model
+
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+
+        ATTN_IMPLEMENTATION = {
+            "eager": "DeepseekAttention",
+            "flash_attention_2": "DeepseekFlashAttention2",
+            "sdpa": "DeepseekSdpaAttention",
+        }
+        policy = {}
+        attn_cls = ATTN_IMPLEMENTATION[self.origin_attn_implement]
+        sp_mode = self.shard_config.sequence_parallelism_mode or None
+        sp_size = self.shard_config.sequence_parallel_size or None
+        sp_group = self.shard_config.sequence_parallel_process_group or None
+        sp_partial_derived = sp_mode in ["split_gather", "ring"]
+        if sp_mode == "all_to_all":
+            decoder_attribute_replacement = {
+                "num_heads": self.model.config.num_attention_heads // sp_size,
+            }
+            if getattr(self.model.config, "num_key_value_heads", False):
+                decoder_attribute_replacement["num_key_value_heads"] = self.model.config.num_key_value_heads // sp_size
+
+            policy[attn_cls] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+            )
+        if self.shard_config.enable_sequence_parallelism:
+            if self.pipeline_stage_manager is not None:
+                # NOTE: we are replacing model forward for both sequence parallelism and pipeline parallelism
+                # if both are enabled, one of them will be ignored
+                raise NotImplementedError("Sequence parallelism is not supported with pipeline parallelism.")
+            self.append_or_create_method_replacement(
+                description={
+                    "forward": get_deepseek_flash_attention_forward(self.shard_config, sp_mode, sp_size, sp_group),
+                },
+                policy=policy,
+                target_key=attn_cls,
+            )
+            if self.pipeline_stage_manager is None:
+                self.append_or_create_method_replacement(
+                    description={
+                        "forward": get_deepseek_flash_attention_model_forward(
+                            self.shard_config,
+                            sp_mode=sp_mode,
+                            sp_size=sp_size,
+                            sp_group=sp_group,
+                        ),
+                    },
+                    policy=policy,
+                    target_key="DeepseekModel",
+                )
+        embedding_cls = None
+        if self.shard_config.enable_tensor_parallelism:
+            embedding_cls = VocabParallelEmbedding1D
+        else:
+            if self.tie_weight:
+                embedding_cls = PaddingEmbedding
+        if self.shard_config.enable_tensor_parallelism:
+            # tensor parallelism for non-moe params
+            assert (
+                self.model.config.num_attention_heads % self.shard_config.tensor_parallel_size == 0
+            ), f"The number of attention heads must be divisible by tensor parallel size."
+            assert (
+                self.model.config.num_key_value_heads % self.shard_config.tensor_parallel_size == 0
+            ), f"The number of key_value heads must be divisible by tensor parallel size."
+            decoder_attribute_replacement = {
+                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
+                // self.shard_config.tensor_parallel_size,
+            }
+
+            policy["DeepseekDecoderLayer"] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.q_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.k_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.v_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.o_proj",
+                        target_module=Linear1D_Row,
+                    ),
+                ],
+            )
+        if embedding_cls is not None:
+            self.append_or_create_submodule_replacement(
+                description=SubModuleReplacementDescription(
+                    suffix="embed_tokens",
+                    target_module=embedding_cls,
+                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                ),
+                policy=policy,
+                target_key="DeepseekModel",
+            )
+
+        if self.shard_config.ep_group:
+            # expert parallel
+            self.append_or_create_submodule_replacement(
+                description=[
+                    SubModuleReplacementDescription(
+                        suffix="mlp",
+                        target_module=EPDeepseekMoE,
+                        kwargs={
+                            "ep_group": self.shard_config.ep_group,
+                            "tp_group": self.shard_config.tensor_parallel_process_group,
+                            "moe_dp_group": self.shard_config.moe_dp_group,
+                        },
+                    )
+                ],
+                policy=policy,
+                target_key="DeepseekDecoderLayer",
+            )
+
+        # optimization configuration
+        if self.shard_config.enable_fused_normalization:
+            self.append_or_create_submodule_replacement(
+                description=[
+                    SubModuleReplacementDescription(
+                        suffix="input_layernorm",
+                        target_module=FusedRMSNorm,
+                        kwargs={"sp_partial_derived": sp_partial_derived},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="post_attention_layernorm",
+                        target_module=FusedRMSNorm,
+                        kwargs={"sp_partial_derived": sp_partial_derived},
+                    ),
+                ],
+                policy=policy,
+                target_key="DeepseekDecoderLayer",
+            )
+
+            self.append_or_create_submodule_replacement(
+                description=SubModuleReplacementDescription(
+                    suffix="norm",
+                    target_module=FusedRMSNorm,
+                    kwargs={"sp_partial_derived": sp_partial_derived},
+                ),
+                policy=policy,
+                target_key="DeepseekModel",
+            )
+
+        if self.shard_config.enable_flash_attention:
+            # NOTE: there is a bug for toggling flash attention in AutoModel, which has to be used for deepseek right now
+            from transformers.dynamic_module_utils import get_class_from_dynamic_module
+
+            flash_attn_cls = get_class_from_dynamic_module(
+                "deepseek-ai/deepseek-moe-16b-base--modeling_deepseek.DeepseekFlashAttention2",
+                "deepseek-ai/deepseek-moe-16b-base",
+            )
+
+            class TargetFlashAttn:
+                def __init__(self):
+                    raise RuntimeError("This class should not be instantiated")
+
+                @staticmethod
+                def from_native_module(original_attn: nn.Module, *args, **kwargs) -> nn.Module:
+                    original_attn.__class__ = flash_attn_cls
+                    original_attn._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+                    return original_attn
+
+            self.append_or_create_submodule_replacement(
+                description=SubModuleReplacementDescription(
+                    suffix="self_attn",
+                    target_module=TargetFlashAttn,
+                ),
+                policy=policy,
+                target_key="DeepseekDecoderLayer",
+            )
+        return policy
+
+    def postprocess(self):
+        return self.model
+
+    def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, policy: Dict) -> None:
+        """If under pipeline parallel setting, replacing the original forward method of huggingface
+        to customized forward method, and add this changing to policy."""
+        if self.pipeline_stage_manager:
+            if self.shard_config.enable_sequence_parallelism:
+                # NOTE: we are replacing model forward for both sequence parallelism and pipeline parallelism
+                # if both are enabled, one of them will be ignored
+                raise NotImplementedError("Pipeline parallelism is not supported with sequence parallelism.")
+            stage_manager = self.pipeline_stage_manager
+            if self.model.__class__.__name__ == "DeepseekModel":
+                module = self.model
+            else:
+                module = self.model.model
+
+            layers_per_stage = stage_manager.distribute_layers(len(module.layers))
+            stage_index = stage_manager.get_stage_index(layers_per_stage)
+            method_replacement = {"forward": partial(new_forward, stage_manager=stage_manager, stage_index=stage_index)}
+            self.append_or_create_method_replacement(
+                description=method_replacement, policy=policy, target_key=model_cls
+            )
+
+        return
+
+    def get_held_layers(self) -> List[Module]:
+        """Get pipeline layers for current stage."""
+        assert self.pipeline_stage_manager is not None
+
+        if self.model.__class__.__name__ == "DeepseekModel":
+            module = self.model
+        else:
+            module = self.model.model
+        stage_manager = self.pipeline_stage_manager
+
+        held_layers = []
+        layers_per_stage = stage_manager.distribute_layers(len(module.layers))
+        if stage_manager.is_first_stage():
+            held_layers.append(module.embed_tokens)
+        start_idx, end_idx = stage_manager.get_stage_index(layers_per_stage)
+        held_layers.extend(module.layers[start_idx:end_idx])
+        if stage_manager.is_last_stage():
+            held_layers.append(module.norm)
+
+        return held_layers
+
+
+class DeepseekModelPolicy(DeepseekPolicy):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def module_policy(self):
+        policy = super().module_policy()
+        if self.pipeline_stage_manager:
+            # set None as default
+            self.set_pipeline_forward(
+                model_cls="DeepseekModel",
+                new_forward=DeepseekPipelineForwards.deepseek_model_forward,
+                policy=policy,
+            )
+        return policy
+
+    def get_held_layers(self) -> List[Module]:
+        """Get pipeline layers for current stage."""
+        held_layers = super().get_held_layers()
+        return held_layers
+
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """No shared params in llama model"""
+        return []
+
+
+class DeepseekForCausalLMPolicy(DeepseekPolicy):
+    def module_policy(self):
+        policy = super().module_policy()
+        # TODO: assign pg mesh from plugin to all modules
+        if self.shard_config.enable_tensor_parallelism:
+            # add a new item for casual lm
+            new_item = {
+                "DeepseekForCausalLM": ModulePolicyDescription(
+                    sub_module_replacement=[
+                        SubModuleReplacementDescription(
+                            suffix="lm_head",
+                            target_module=Linear1D_Col,
+                            kwargs=dict(gather_output=True),
+                        )
+                    ]
+                )
+            }
+            policy.update(new_item)
+
+        if self.pipeline_stage_manager:
+            # set None as default
+            self.set_pipeline_forward(
+                model_cls="DeepseekForCausalLM",
+                new_forward=DeepseekPipelineForwards.deepseek_for_causal_lm_forward,
+                policy=policy,
+            )
+
+        return policy
+
+    def get_held_layers(self) -> List[Module]:
+        """Get pipeline layers for current stage."""
+        stage_manager = self.pipeline_stage_manager
+        held_layers = super().get_held_layers()
+        if stage_manager.is_last_stage():
+            held_layers.append(self.model.lm_head)
+        return held_layers
+
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        deepseek_model = self.model.model
+        if self.pipeline_stage_manager and self.pipeline_stage_manager.num_stages > 1:
+            if (
+                id(deepseek_model.embed_tokens.weight) == id(self.model.lm_head.weight)
+                and self.pipeline_stage_manager.num_stages > 1
+            ):
+                # tie weights
+                return [
+                    {
+                        0: deepseek_model.embed_tokens.weight,
+                        self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
+                    }
+                ]
+        return []
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 0f28f6cf49a9..6f8404219364 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -1,4 +1,3 @@
-import warnings
 from functools import partial
 from typing import Callable, Dict, List, Union
 
@@ -65,13 +64,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
             norm_cls = FusedRMSNorm
         else:
             norm_cls = RMSNorm
-        if self.pipeline_stage_manager is not None:
-            self.shard_config.enable_sequence_parallelism = False
-            self.shard_config.enable_sequence_overlap = False
-            self.shard_config.sequence_parallelism_mode = None
-            warnings.warn(
-                f"For llama, sequence parallelism is currently not compatible with pipeline parallelism, set to be False"
-            )
+
         sp_mode = self.shard_config.sequence_parallelism_mode or None
         sp_size = self.shard_config.sequence_parallel_size or None
         sp_group = self.shard_config.sequence_parallel_process_group or None
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index 0fb858d78011..10df143c99da 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -1,13 +1,21 @@
+import warnings
 from functools import partial
 from typing import Callable, Dict, List, Union
 
 import torch.nn as nn
 from torch import Tensor
 from torch.nn import Module
-from transformers.models.mixtral.modeling_mixtral import MixtralDecoderLayer, MixtralForCausalLM, MixtralModel
+from transformers.models.mixtral.modeling_mixtral import MixtralForCausalLM, MixtralModel
 
 from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
-from colossalai.shardformer.modeling.mixtral import EPMixtralSparseMoeBlock, MixtralPipelineForwards
+from colossalai.shardformer.layer.embedding import PaddingEmbedding, VocabParallelEmbedding1D
+from colossalai.shardformer.layer.linear import Linear1D_Row
+from colossalai.shardformer.modeling.mixtral import (
+    EPMixtralSparseMoeBlock,
+    MixtralPipelineForwards,
+    get_mixtral_flash_attention_forward,
+    get_mixtral_flash_attention_model_forward,
+)
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 
 __all__ = ["MixtralPolicy", "MixtralForCausalLMPolicy"]
@@ -18,36 +26,136 @@ def config_sanity_check(self):
         pass
 
     def preprocess(self):
-        if self.shard_config.enable_tensor_parallelism:
-            # Resize embedding
-            vocab_size = self.model.config.vocab_size
-            world_size = self.shard_config.tensor_parallel_size
-
-            if vocab_size % world_size != 0:
-                new_vocab_size = vocab_size + world_size - vocab_size % world_size
-                self.model.resize_token_embeddings(new_vocab_size)
-
+        self.tie_weight = self.tie_weight_check()
+        self.origin_attn_implement = self.model.config._attn_implementation
         return self.model
 
     def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        from transformers.models.mixtral.modeling_mixtral import (
+            MixtralAttention,
+            MixtralDecoderLayer,
+            MixtralFlashAttention2,
+            MixtralModel,
+            MixtralSdpaAttention,
+        )
+
+        ATTN_IMPLEMENTATION = {
+            "eager": MixtralAttention,
+            "flash_attention_2": MixtralFlashAttention2,
+            "sdpa": MixtralSdpaAttention,
+        }
         policy = {}
+        attn_cls = ATTN_IMPLEMENTATION[self.origin_attn_implement]
+
+        sp_mode = self.shard_config.sequence_parallelism_mode or None
+        sp_size = self.shard_config.sequence_parallel_size or None
+        sp_group = self.shard_config.sequence_parallel_process_group or None
+        sp_partial_derived = sp_mode in ["split_gather", "ring"]
+        if sp_mode == "all_to_all":
+            decoder_attribute_replacement = {
+                "num_heads": self.model.config.num_attention_heads // sp_size,
+            }
+            if getattr(self.model.config, "num_key_value_heads", False):
+                decoder_attribute_replacement["num_key_value_heads"] = self.model.config.num_key_value_heads // sp_size
 
+            policy[attn_cls] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+            )
         if self.shard_config.enable_sequence_parallelism:
-            self.shard_config.enable_sequence_parallelism = False
-            raise NotImplementedError(
-                "Mixtral dosen't support sequence parallelism now, will ignore the sequence parallelism flag."
+            if self.pipeline_stage_manager is not None:
+                # NOTE: we are replacing model forward for both sequence parallelism and pipeline parallelism
+                # if both are enabled, one of them will be ignored
+                raise NotImplementedError("Sequence parallelism is not supported with pipeline parallelism.")
+            self.append_or_create_method_replacement(
+                description={
+                    "forward": get_mixtral_flash_attention_forward(self.shard_config, sp_mode, sp_size, sp_group),
+                },
+                policy=policy,
+                target_key=attn_cls,
+            )
+            self.append_or_create_method_replacement(
+                description={
+                    "forward": get_mixtral_flash_attention_model_forward(
+                        self.shard_config,
+                        sp_mode=sp_mode,
+                        sp_size=sp_size,
+                        sp_group=sp_group,
+                    ),
+                },
+                policy=policy,
+                target_key=MixtralModel,
             )
 
+        embedding_cls = None
         if self.shard_config.enable_tensor_parallelism:
-            raise NotImplementedError("Tensor parallelism is not supported for Mixtral model now.")
-        if getattr(self.shard_config, "ep_group", None) is not None:
+            embedding_cls = VocabParallelEmbedding1D
+        else:
+            if self.tie_weight:
+                embedding_cls = PaddingEmbedding
+
+        if self.shard_config.enable_tensor_parallelism:
+            # tensor parallelism for non-moe params
+            assert (
+                self.model.config.num_attention_heads % self.shard_config.tensor_parallel_size == 0
+            ), f"The number of attention heads must be divisible by tensor parallel size."
+            assert (
+                self.model.config.num_key_value_heads % self.shard_config.tensor_parallel_size == 0
+            ), f"The number of key_value heads must be divisible by tensor parallel size."
+            decoder_attribute_replacement = {
+                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
+                // self.shard_config.tensor_parallel_size,
+            }
+
+            policy[MixtralDecoderLayer] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.q_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.k_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.v_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.o_proj",
+                        target_module=Linear1D_Row,
+                    ),
+                    SubModuleReplacementDescription(  # or replicate?
+                        suffix="block_sparse_moe.gate", target_module=Linear1D_Col, kwargs={"gather_output": True}
+                    ),
+                ],
+            )
+
+        if embedding_cls is not None:
+            self.append_or_create_submodule_replacement(
+                description=SubModuleReplacementDescription(
+                    suffix="embed_tokens",
+                    target_module=embedding_cls,
+                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                ),
+                policy=policy,
+                target_key=MixtralModel,
+            )
+
+        if self.shard_config.ep_group:
             # expert parallel
             self.append_or_create_submodule_replacement(
                 description=[
                     SubModuleReplacementDescription(
                         suffix="block_sparse_moe",
                         target_module=EPMixtralSparseMoeBlock,
-                        kwargs={"ep_group": self.shard_config.ep_group},
+                        kwargs={
+                            "ep_group": self.shard_config.ep_group,
+                            "tp_group": self.shard_config.tensor_parallel_process_group,
+                            "moe_dp_group": self.shard_config.moe_dp_group,
+                        },
                     )
                 ],
                 policy=policy,
@@ -61,10 +169,12 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="input_layernorm",
                         target_module=FusedRMSNorm,
+                        kwargs={"sp_partial_derived": sp_partial_derived},
                     ),
                     SubModuleReplacementDescription(
                         suffix="post_attention_layernorm",
                         target_module=FusedRMSNorm,
+                        kwargs={"sp_partial_derived": sp_partial_derived},
                     ),
                 ],
                 policy=policy,
@@ -75,13 +185,15 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 description=SubModuleReplacementDescription(
                     suffix="norm",
                     target_module=FusedRMSNorm,
+                    kwargs={"sp_partial_derived": sp_partial_derived},
                 ),
                 policy=policy,
                 target_key=MixtralModel,
             )
 
         if self.shard_config.enable_flash_attention:
-            raise NotImplementedError("Flash attention has already been replaced in mixtral.")
+            warnings.warn("Flash attention is natively supported in transformers, will ignore the flag.")
+            self.shard_config.enable_flash_attention = False
 
         return policy
 
@@ -92,6 +204,10 @@ def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, poli
         """If under pipeline parallel setting, replacing the original forward method of huggingface
         to customized forward method, and add this changing to policy."""
         if self.pipeline_stage_manager:
+            if self.shard_config.enable_sequence_parallelism:
+                # NOTE: we are replacing model forward for both sequence parallelism and pipeline parallelism
+                # if both are enabled, one of them will be ignored
+                raise NotImplementedError("Pipeline parallelism is not supported with sequence parallelism.")
             stage_manager = self.pipeline_stage_manager
             if self.model.__class__.__name__ == "MixtralModel":
                 module = self.model
@@ -150,7 +266,7 @@ def get_held_layers(self) -> List[Module]:
         return held_layers
 
     def get_shared_params(self) -> List[Dict[int, Tensor]]:
-        """No shared params in llama model"""
+        """No shared params in mixtral model"""
         return []
 
 
@@ -192,17 +308,54 @@ def get_held_layers(self) -> List[Module]:
         return held_layers
 
     def get_shared_params(self) -> List[Dict[int, Tensor]]:
-        llama_model = self.model.model
+        mixtral_model = self.model.model
         if self.pipeline_stage_manager and self.pipeline_stage_manager.num_stages > 1:
             if (
-                id(llama_model.embed_tokens.weight) == id(self.model.lm_head.weight)
+                id(mixtral_model.embed_tokens.weight) == id(self.model.lm_head.weight)
                 and self.pipeline_stage_manager.num_stages > 1
             ):
                 # tie weights
                 return [
                     {
-                        0: llama_model.embed_tokens.weight,
+                        0: mixtral_model.embed_tokens.weight,
                         self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
                     }
                 ]
         return []
+
+
+class MixtralForSequenceClassificationPolicy(MixtralPolicy):
+    def module_policy(self):
+        from transformers import MixtralForSequenceClassification
+
+        policy = super().module_policy()
+
+        if self.shard_config.enable_tensor_parallelism:
+            # add a new item for sequence classification
+            new_item = {
+                MixtralForSequenceClassification: ModulePolicyDescription(
+                    sub_module_replacement=[
+                        SubModuleReplacementDescription(
+                            suffix="score", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                        )
+                    ]
+                )
+            }
+            policy.update(new_item)
+
+        if self.pipeline_stage_manager:
+            raise NotImplementedError
+
+        return policy
+
+    def get_held_layers(self) -> List[Module]:
+        """Get pipeline layers for current stage."""
+        stage_manager = self.pipeline_stage_manager
+        held_layers = super().get_held_layers()
+        if stage_manager.is_last_stage(ignore_chunk=True):
+            held_layers.append(self.model.score)
+        return held_layers
+
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """No shared params in mixtral for sequence classification model"""
+        return []
diff --git a/colossalai/shardformer/policies/qwen2.py b/colossalai/shardformer/policies/qwen2.py
index 3e427c4a1623..362c14060fd9 100644
--- a/colossalai/shardformer/policies/qwen2.py
+++ b/colossalai/shardformer/policies/qwen2.py
@@ -1,4 +1,3 @@
-import warnings
 from functools import partial
 from typing import Callable, Dict, List, Union
 
@@ -82,9 +81,20 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 embedding_cls = PaddingEmbedding
         norm_cls = FusedRMSNorm if self.shard_config.enable_fused_normalization else RMSNorm
 
-        if self.shard_config.enable_sequence_parallelism:
-            self.shard_config.enable_sequence_parallelism = False
-            warnings.warn("Qwen2 doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
+        sp_mode = self.shard_config.sequence_parallelism_mode or None
+        sp_size = self.shard_config.sequence_parallel_size or None
+        sp_group = self.shard_config.sequence_parallel_process_group or None
+        sp_partial_derived = sp_mode in ["split_gather", "ring"]
+        if sp_mode == "all_to_all":
+            decoder_attribute_replacement = {
+                "num_heads": self.model.config.num_attention_heads // sp_size,
+            }
+            if getattr(self.model.config, "num_key_value_heads", False):
+                decoder_attribute_replacement["num_key_value_heads"] = self.model.config.num_key_value_heads // sp_size
+
+            policy[attn_cls] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+            )
 
         if self.shard_config.enable_tensor_parallelism:
             assert (
@@ -109,30 +119,37 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=Linear1D_Col,
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=Linear1D_Col,
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
                         target_module=Linear1D_Row,
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.gate_proj",
                         target_module=Linear1D_Col,
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.up_proj",
                         target_module=Linear1D_Col,
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.down_proj",
                         target_module=Linear1D_Row,
+                        kwargs=dict(seq_parallel_mode=sp_mode),
                     ),
                 ],
             )
@@ -154,10 +171,12 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 SubModuleReplacementDescription(
                     suffix="input_layernorm",
                     target_module=norm_cls,
+                    kwargs={"sp_partial_derived": sp_partial_derived},
                 ),
                 SubModuleReplacementDescription(
                     suffix="post_attention_layernorm",
                     target_module=norm_cls,
+                    kwargs={"sp_partial_derived": sp_partial_derived},
                 ),
             ],
             policy=policy,
@@ -168,16 +187,16 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
             description=SubModuleReplacementDescription(
                 suffix="norm",
                 target_module=norm_cls,
+                kwargs={"sp_partial_derived": sp_partial_derived},
             ),
             policy=policy,
             target_key=Qwen2Model,
         )
 
-        # use flash attention
-        if self.shard_config.enable_flash_attention:
+        if self.shard_config.enable_flash_attention or self.shard_config.enable_sequence_parallelism:
             self.append_or_create_method_replacement(
                 description={
-                    "forward": get_qwen2_flash_attention_forward(self.shard_config),
+                    "forward": get_qwen2_flash_attention_forward(self.shard_config, sp_mode, sp_size, sp_group),
                 },
                 policy=policy,
                 target_key=attn_cls,
@@ -186,7 +205,9 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 # replace qwen2 model forward method
                 self.append_or_create_method_replacement(
                     description={
-                        "forward": get_qwen2_model_forward_for_flash_attn(self.shard_config),
+                        "forward": get_qwen2_model_forward_for_flash_attn(
+                            self.shard_config, sp_mode, sp_size, sp_group
+                        ),
                     },
                     policy=policy,
                     target_key=Qwen2Model,
diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py
index 7372e06c2444..8e33d786b6f9 100644
--- a/colossalai/shardformer/shard/shard_config.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -47,6 +47,9 @@ class ShardConfig:
     make_vocab_size_divisible_by: int = 64
     gradient_checkpoint_config: Optional[GradientCheckpointConfig] = None
     extra_kwargs: Dict[str, Any] = field(default_factory=dict)
+
+    # for moe related
+    moe_dp_group: Optional[ProcessGroup] = None
     ep_group: Optional[ProcessGroup] = None
     fp8_communication: bool = False
     # pipeline_parallel_size: int
diff --git a/colossalai/shardformer/shard/shardformer.py b/colossalai/shardformer/shard/shardformer.py
index b54c5827316e..db03eec414c2 100644
--- a/colossalai/shardformer/shard/shardformer.py
+++ b/colossalai/shardformer/shard/shardformer.py
@@ -1,4 +1,3 @@
-import os
 from typing import Dict, List, Tuple
 
 import torch.distributed as dist
@@ -11,9 +10,6 @@
 from .shard_config import ShardConfig
 from .sharder import ModelSharder
 
-# set CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that when communication and computation overlap, the order of core scheduling is correct
-os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
-
 
 class ShardFormer:
     """
diff --git a/colossalai/tensor/d_tensor/layout_converter.py b/colossalai/tensor/d_tensor/layout_converter.py
index c2cf73181345..0f0150d90e7a 100644
--- a/colossalai/tensor/d_tensor/layout_converter.py
+++ b/colossalai/tensor/d_tensor/layout_converter.py
@@ -473,7 +473,7 @@ def _group_alive_check(cached_comm_action_sequence):
                 for process_group in used_process_groups:
                     try:
                         dist.get_rank(process_group)
-                    except RuntimeError as e:
+                    except (ValueError, RuntimeError) as e:
                         # If the group is not registered, it means it has been deleted
                         if str(e) == (
                             f"Group {process_group} is not registered, please create group with torch.distributed.new_group API"
diff --git a/colossalai/tensor/d_tensor/sharding_spec.py b/colossalai/tensor/d_tensor/sharding_spec.py
index 16a4f248bfc2..76d85a112aac 100644
--- a/colossalai/tensor/d_tensor/sharding_spec.py
+++ b/colossalai/tensor/d_tensor/sharding_spec.py
@@ -1,4 +1,3 @@
-from copy import deepcopy
 from typing import Dict, List
 
 from ..utils import merge_same_dim_mesh_list
@@ -23,10 +22,11 @@ class DimSpec:
             Otherwise, the element in shard_list means the data will be sharded in that dimension.
     """
 
+    _DIFFERENCE_DICT = None
+
     def __init__(self, shard_list):
         self.is_replica = len(shard_list) == 0
         self.shard_list = shard_list
-        self.build_difference_2d_dict()
 
     def __eq__(self, other):
         return str(self) == str(other)
@@ -39,24 +39,43 @@ def __repr__(self):
             target += str(dim)
         return target
 
-    def _convert_str_to_shard_list(self, str_spec):
+    @property
+    def difference_dict(self):
         """
-        Convert str_spec into shard_list.
+        Returns the difference dict, and lazily initializes it when needed
 
-        Argument:
-            str_spec(str): dim spec in str type.
+        Return:
+            difference_dict(Dict[Tuple[int, int], Union[int, float, str]]):
+                difference dict
         """
+        if self._DIFFERENCE_DICT is None:
+            self._DIFFERENCE_DICT = self._build_difference_2d_dict()
 
-        if str_spec == "R":
-            return []
-        if str_spec == "S0":
-            return [0]
-        if str_spec == "S1":
-            return [1]
-        if str_spec == "S01":
-            return [0, 1]
+        return self._DIFFERENCE_DICT
 
-    def build_difference_2d_dict(self):
+    def dim_diff(self, other):
+        """
+        The difference between two DimSpec.
+
+        Argument:
+            other(DimSpec): the dim spec to compare with.
+
+        Return:
+            difference(int): the difference between two DimSpec.
+
+        Example:
+            dim_spec = DimSpec([0])
+            other_dim_spec = DimSpec([0, 1])
+            print(dim_spec.dim_diff(other_dim_spec))
+
+        Output:
+            5
+        """
+        difference = self.difference_dict[(str(self), str(other))]
+        return difference
+
+    @classmethod
+    def _build_difference_2d_dict(cls):
         """
         Build a difference mapping for 2D device mesh case. It will be used to
         compute the difference between DimSpec pairs.
@@ -67,9 +86,8 @@ def build_difference_2d_dict(self):
         difference_dict = {}
         for source_spec in source_spec_list:
             for target_spec in target_spec_list:
-                spec_pair = (deepcopy(source_spec), deepcopy(target_spec))
-                source_shard_list = self._convert_str_to_shard_list(source_spec)
-                target_shard_list = self._convert_str_to_shard_list(target_spec)
+                source_shard_list = cls._convert_str_to_shard_list(source_spec)
+                target_shard_list = cls._convert_str_to_shard_list(target_spec)
 
                 # source same as target
                 if source_shard_list == target_shard_list:
@@ -112,30 +130,27 @@ def build_difference_2d_dict(self):
 
                 else:
                     difference = NAN
-                difference_dict[spec_pair] = difference
+                difference_dict[(source_spec, target_spec)] = difference
 
-        self.difference_dict = difference_dict
+        return difference_dict
 
-    def dim_diff(self, other):
+    @staticmethod
+    def _convert_str_to_shard_list(str_spec):
         """
-        The difference between two _DimSpec.
+        Convert str_spec into shard_list.
 
         Argument:
-            other(_DimSpec): the dim spec to compare with.
-
-        Return:
-            difference(int): the difference between two _DimSpec.
-
-        Example:
-            dim_spec = _DimSpec([0])
-            other_dim_spec = _DimSpec([0, 1])
-            print(dim_spec.difference(other_dim_spec))
-
-        Output:
-            5
+            str_spec(str): dim spec in str type.
         """
-        difference = self.difference_dict[(str(self), str(other))]
-        return difference
+
+        if str_spec == "R":
+            return []
+        if str_spec == "S0":
+            return [0]
+        if str_spec == "S1":
+            return [1]
+        if str_spec == "S01":
+            return [0, 1]
 
 
 class ShardingSpec:
diff --git a/colossalai/tensor/sharding_spec.py b/colossalai/tensor/sharding_spec.py
index b78ef6d97dd4..fb42afab75b9 100644
--- a/colossalai/tensor/sharding_spec.py
+++ b/colossalai/tensor/sharding_spec.py
@@ -1,5 +1,4 @@
 import operator
-from copy import deepcopy
 from functools import reduce
 
 import torch
@@ -27,10 +26,11 @@ class _DimSpec:
             Otherwise, the element in shard_list means the data will be sharded in that dimension.
     """
 
+    _DIFFERENCE_DICT = None
+
     def __init__(self, shard_list):
         self.is_replica = len(shard_list) == 0
         self.shard_list = shard_list
-        self.build_difference_2d_dict()
 
     def __eq__(self, other):
         return str(self) == str(other)
@@ -43,27 +43,46 @@ def __repr__(self):
             target += str(dim)
         return target
 
-    def _convert_str_to_shard_list(self, str_spec):
+    @property
+    def difference_dict(self):
         """
-        Convert str_spec into shard_list.
+        Returns the difference dict, and lazily initializes it when needed
 
-        Argument:
-            str_spec(str): dim spec in str type.
+        Return:
+            difference_dict(Dict[Tuple[int, int], Union[int, float, str]]):
+                difference dict
         """
+        if self._DIFFERENCE_DICT is None:
+            self._DIFFERENCE_DICT = self._build_difference_2d_dict()
 
-        if str_spec == "R":
-            return []
-        if str_spec == "S0":
-            return [0]
-        if str_spec == "S1":
-            return [1]
-        if str_spec == "S01":
-            return [0, 1]
+        return self._DIFFERENCE_DICT
 
-    def build_difference_2d_dict(self):
+    def difference(self, other):
+        """
+        The difference between two _DimSpec.
+
+        Argument:
+            other(_DimSpec): the dim spec to compare with.
+
+        Return:
+            difference(int): the difference between two _DimSpec.
+
+        Example:
+            dim_spec = _DimSpec([0])
+            other_dim_spec = _DimSpec([0, 1])
+            print(dim_spec.difference(other_dim_spec))
+
+        Output:
+            5
+        """
+        difference = self.difference_dict[(str(self), str(other))]
+        return difference
+
+    @classmethod
+    def _build_difference_2d_dict(cls):
         """
         Build a difference mapping for 2D device mesh case. It will be used to
-        compute the difference between DimSpec pairs.
+        compute the difference between _DimSpec pairs.
         """
 
         source_spec_list = ["R", "S0", "S1", "S01"]
@@ -71,9 +90,8 @@ def build_difference_2d_dict(self):
         difference_dict = {}
         for source_spec in source_spec_list:
             for target_spec in target_spec_list:
-                spec_pair = (deepcopy(source_spec), deepcopy(target_spec))
-                source_shard_list = self._convert_str_to_shard_list(source_spec)
-                target_shard_list = self._convert_str_to_shard_list(target_spec)
+                source_shard_list = cls._convert_str_to_shard_list(source_spec)
+                target_shard_list = cls._convert_str_to_shard_list(target_spec)
 
                 # source same as target
                 if source_shard_list == target_shard_list:
@@ -116,30 +134,27 @@ def build_difference_2d_dict(self):
 
                 else:
                     difference = NAN
-                difference_dict[spec_pair] = difference
+                difference_dict[(source_spec, target_spec)] = difference
 
-        self.difference_dict = difference_dict
+        return difference_dict
 
-    def difference(self, other):
+    @staticmethod
+    def _convert_str_to_shard_list(str_spec):
         """
-        The difference between two _DimSpec.
+        Convert str_spec into shard_list.
 
         Argument:
-            other(_DimSpec): the dim spec to compare with.
-
-        Return:
-            difference(int): the difference between two _DimSpec.
-
-        Example:
-            dim_spec = _DimSpec([0])
-            other_dim_spec = _DimSpec([0, 1])
-            print(dim_spec.difference(other_dim_spec))
-
-        Output:
-            5
+            str_spec(str): dim spec in str type.
         """
-        difference = self.difference_dict[(str(self), str(other))]
-        return difference
+
+        if str_spec == "R":
+            return []
+        if str_spec == "S0":
+            return [0]
+        if str_spec == "S1":
+            return [1]
+        if str_spec == "S01":
+            return [0, 1]
 
 
 class ShardingSpecException(Exception):
diff --git a/colossalai/zero/low_level/bookkeeping/gradient_store.py b/colossalai/zero/low_level/bookkeeping/gradient_store.py
index e24a67f9de3c..8b6d403f1327 100644
--- a/colossalai/zero/low_level/bookkeeping/gradient_store.py
+++ b/colossalai/zero/low_level/bookkeeping/gradient_store.py
@@ -19,7 +19,6 @@ def __init__(self, *args, partition_grad: bool = False):
         """
         self._grads_of_params = dict()
         # stage 2
-        self._partition_grads = partition_grad
         self._working_index = 0 if partition_grad else self._local_rank
         # for zero2, it's `param_id: [grad_local_rank]`
         self.grad_to_param_mapping = dict()
@@ -91,7 +90,7 @@ def get_working_grads_by_group_id(self, group_id: int) -> List:
 
         return grad_list
 
-    def get_working_grad_by_param_id(self, param_id) -> Tensor:
+    def get_working_grad_by_param_id(self, param_id) -> Optional[Tensor]:
         """
         Return the working gradient for the specified parameter.
 
@@ -112,6 +111,7 @@ def reset_grads_by_group_id(self, group_id: int):
 
     def reset_all_gradients(self):
         self._grads_of_params = dict()
+        self.grad_to_param_mapping = dict()
 
     def get_param_id_for_grad(self, grad: Tensor) -> Optional[int]:
         """Return the id of a parameter which the gradient slice belongs to
diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py
index 1353071c59df..458e6e41a29e 100644
--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -21,9 +21,11 @@
 from colossalai.interface import OptimizerWrapper
 from colossalai.logging import get_dist_logger
 from colossalai.quantization.fp8 import all_gather_into_tensor_flat_fp8, all_reduce_fp8, reduce_scatter_fp8
+from colossalai.tensor.moe_tensor.api import is_moe_tensor
 
 from ._utils import calculate_global_norm_from_list, has_inf_or_nan, release_param_grad, sync_tensor
 from .bookkeeping import BucketStore, GradientStore, TensorBucket
+from .zero_hook import set_all_gather_handle, wait_all_gather_handle
 
 
 class LowLevelZeroFP16MixedPrecisionMixin(FP16MixedPrecisionMixin):
@@ -66,7 +68,7 @@ class LowLevelZeroOptimizer(OptimizerWrapper):
     def __init__(
         self,
         optimizer: Optimizer,
-        pg_to_param_list: Dict[ProcessGroup, List[nn.Parameter]] = None,
+        pg_to_param_list: Optional[Dict[ProcessGroup, List[nn.Parameter]]] = None,
         initial_scale: int = 2**16,  # grad scaler config
         min_scale: int = 1,
         growth_factor: float = 2.0,
@@ -84,6 +86,7 @@ def __init__(
         dp_process_group: Optional[ProcessGroup] = None,
         forced_dtype: Optional[torch.dtype] = None,
         master_weights: bool = True,  # master weights
+        overlap_allgather: bool = False,
         fp8_communication: bool = False,
     ):
         super(LowLevelZeroOptimizer, self).__init__(optim=optimizer)
@@ -92,7 +95,7 @@ def __init__(
         self._logger = get_dist_logger()
         self._verbose = verbose
 
-        if dp_process_group is not None and pg_to_param_list is not None:
+        if (dp_process_group is not None) and (pg_to_param_list is not None):
             raise ValueError("dp_process_group and pg_to_param_list should not be provided at the same time.")
 
         if pg_to_param_list is None:
@@ -123,6 +126,7 @@ def __init__(
 
         # communication params
         self._overlap_communication = overlap_communication
+        self._overlap_allgather = overlap_allgather
         self._reduce_bucket_size = reduce_bucket_size
         self._communication_dtype = communication_dtype
         self._fp8_communication = fp8_communication
@@ -148,6 +152,8 @@ def __init__(
 
         # record the padding size of each param
         self._padding_map = dict()
+        # padded working param is all-gather buffer and it shares the same memory with working param
+        self._working_param_to_padded_working_param = dict()
 
         # mapping working param and master param
         self.master_to_working_param = dict()
@@ -248,11 +254,12 @@ def _create_master_param_current_rank(self, param_list):
             with torch.no_grad():
                 if padding_size > 0:
                     padding_param = torch.nn.functional.pad(param.data.view(-1), [0, padding_size])
-                    # reset working params' ptr when no master weights
-                    if self._master_weights == False:
-                        param.data = padding_param[: param.numel()].view(param.shape)
+                    # # reset working params' ptr when no master weights
+                    # if self._master_weights == False:
+                    param.data = padding_param[: param.numel()].view(param.shape)
                 else:
                     padding_param = param.data.view(-1)
+                self._working_param_to_padded_working_param[param] = padding_param
 
                 splited_params = padding_param.split(
                     padding_param.numel() // self.pid_to_bucket_store[id(param)].world_size
@@ -261,7 +268,7 @@ def _create_master_param_current_rank(self, param_list):
 
                 # use fp32 when master_weights is True
                 if self._master_weights is True:
-                    splited_param_current_rank = splited_params.detach().float().to(device)
+                    splited_param_current_rank = splited_params.detach().clone().float().to(device)
                 else:
                     splited_param_current_rank = splited_params
 
@@ -338,21 +345,21 @@ def _run_reduction(self):
                     self._update_unpartitoned_grad(bucket_store, grad_in_bucket.values(), flat_grads_per_rank, group_id)
                 else:
                     flat_grads_list = list(flat_grads.split(len(flat_grads) // bucket_store.world_size))
-                    recieved_grad = torch.zeros_like(flat_grads_list[0])
+                    received_grad = torch.zeros_like(flat_grads_list[0])
                     if self._fp8_communication:
                         reduce_scatter_fp8(
-                            recieved_grad,
+                            received_grad,
                             flat_grads_list,
                             group=bucket_store.torch_pg,
                         )
                     else:
-                        dist.reduce_scatter(recieved_grad, flat_grads_list, group=bucket_store.torch_pg)
+                        dist.reduce_scatter(received_grad, flat_grads_list, group=bucket_store.torch_pg)
 
-                    if recieved_grad.dtype != grad_dtype:
-                        recieved_grad = recieved_grad.to(grad_dtype)
+                    if received_grad.dtype != grad_dtype:
+                        received_grad = received_grad.to(grad_dtype)
 
                     grad_in_bucket_current_rank = bucket_store.get_grad()[bucket_store.local_rank]
-                    self._update_partitoned_grad(bucket_store, grad_in_bucket_current_rank, recieved_grad, group_id, 1)
+                    self._update_partitoned_grad(bucket_store, grad_in_bucket_current_rank, received_grad, group_id, 1)
 
                 bucket_store.reset()
 
@@ -562,25 +569,29 @@ def step(self, closure=None):
                 working_param = real_working_params[group_id][idx]
                 param_to_gather = master_param.to(device).to(self._dtype)
                 pg = self.param_to_pg[working_param]
-                if param_to_gather.numel() > self.pg_to_tensor_bucket[pg].max_size:
-                    buffer_tensor = torch.empty_like(
-                        torch.cat([param_to_gather for _ in range(dist.get_world_size(pg))])
-                    )
-                    if self._fp8_communication:
-                        all_gather_into_tensor_flat_fp8(buffer_tensor, param_to_gather, pg, fp8_format="e4m3")
-                    else:
-                        dist.all_gather_into_tensor(buffer_tensor, param_to_gather, pg)
-                    working_param.data.copy_(buffer_tensor[: working_param.numel()].reshape_as(working_param))
-                    continue
-                try:
-                    self.pg_to_tensor_bucket[pg].add_to_bucket(param_to_gather, write_back_tensor=working_param)
-                except RuntimeError:
-                    self.pg_to_tensor_bucket[pg].all_gather(pg, fp8_communication=self._fp8_communication)
-                    self.pg_to_tensor_bucket[pg].add_to_bucket(param_to_gather, write_back_tensor=working_param)
+                padded_working_param = self._working_param_to_padded_working_param[working_param]
+                if self._overlap_allgather:
+                    handle = dist.all_gather_into_tensor(padded_working_param, param_to_gather, pg, async_op=True)
+                    set_all_gather_handle(working_param, handle)
+                else:
+                    if param_to_gather.numel() > self.pg_to_tensor_bucket[pg].max_size:
+                        if self._fp8_communication:
+                            all_gather_into_tensor_flat_fp8(
+                                padded_working_param, param_to_gather, pg, fp8_format="e4m3"
+                            )
+                        else:
+                            dist.all_gather_into_tensor(padded_working_param, param_to_gather, pg)
+                        continue
+                    try:
+                        self.pg_to_tensor_bucket[pg].add_to_bucket(param_to_gather, write_back_tensor=working_param)
+                    except RuntimeError:
+                        self.pg_to_tensor_bucket[pg].all_gather(pg, fp8_communication=self._fp8_communication)
+                        self.pg_to_tensor_bucket[pg].add_to_bucket(param_to_gather, write_back_tensor=working_param)
             self.optim.param_groups[group_id]["params"] = self._master_param_groups_of_current_rank[group_id]
-        for pg, tensor_bucket in self.pg_to_tensor_bucket.items():
-            if not tensor_bucket.is_empty():
-                tensor_bucket.all_gather(pg, fp8_communication=self._fp8_communication)
+        if not self._overlap_allgather:
+            for pg, tensor_bucket in self.pg_to_tensor_bucket.items():
+                if not tensor_bucket.is_empty():
+                    tensor_bucket.all_gather(pg, fp8_communication=self._fp8_communication)
 
     def _compute_grad_norm(self, dp_pg: ProcessGroup, gradients: List[Tensor], norm_type: int = 2) -> float:
         r"""
@@ -657,6 +668,11 @@ def _sync_grad(self):
         for group_id in range(self.num_param_groups):
             param_group = self._working_param_groups[group_id]
             for param in param_group:
+                if is_moe_tensor(param) and param.requires_grad and param.grad is None:
+                    # TODO better of of doing this
+                    # assign zero grad to unrouted expert to avoid hang during grad reduction
+                    param.grad = torch.zeros_like(param)
+
                 if param.requires_grad and param.grad is not None:
                     self._add_to_bucket(param, group_id)
 
@@ -815,8 +831,8 @@ def update_master_params(self, model: nn.Module) -> None:
         """
         for p in model.parameters():
             p_id = id(p)
-            pg = self.param_to_pg[p]
             if p_id in self.working_to_master_param:
+                pg = self.param_to_pg[p]
                 master_param = self.working_to_master_param[p_id]
                 padding_size = self.get_param_padding_size(p)
                 working_param = p.data.view(-1)
@@ -877,13 +893,12 @@ def get_padding_map(self) -> Dict[int, Tensor]:
 
     def get_param_grad(self, working_param: nn.Parameter) -> Tensor:
         grad_store = self.pid_to_grad_store[id(working_param)]
-        partial_grad = grad_store.get_working_grad_by_param_id(id(working_param))
-        if partial_grad is None:
+        grad = grad_store.get_working_grad_by_param_id(id(working_param))
+        if grad is None:
             return None
-        tensor_list = [torch.empty_like(partial_grad) for _ in range(grad_store.world_size)]
-        dist.all_gather(tensor_list, partial_grad, group=grad_store.torch_pg)
-        grad_flat = torch.cat(tensor_list, dim=0)
-        return grad_flat[: working_param.numel()].reshape_as(working_param)
+        grad_flat = torch.empty((grad_store.world_size, *grad.shape), dtype=grad.dtype, device=grad.device)
+        dist.all_gather_into_tensor(grad_flat, grad, group=grad_store.torch_pg)
+        return grad_flat.view(-1)[: working_param.numel()].view_as(working_param)
 
     def get_working_grads_by_group_id(self, group_id: int) -> List[Tensor]:
         working_grads = []
@@ -908,3 +923,7 @@ def get_working_grad_by_param_id(self, param_id: int) -> Tensor:
     def get_partitioned_gradients_by_param_id(self, group_id: int, param_id: int) -> List:
         grad_store = self.pid_to_grad_store[param_id]
         return grad_store.get_partitioned_gradients_by_param_id(group_id, param_id)
+
+    def _force_wait_all_gather(self):
+        for param in self._working_param_to_padded_working_param.keys():
+            wait_all_gather_handle(param)
diff --git a/colossalai/zero/low_level/zero_hook.py b/colossalai/zero/low_level/zero_hook.py
new file mode 100644
index 000000000000..20f9ef31aae0
--- /dev/null
+++ b/colossalai/zero/low_level/zero_hook.py
@@ -0,0 +1,33 @@
+from typing import List
+
+from torch._tensor import Tensor
+
+from colossalai.tensor.param_op_hook import ColoParamOpHook
+
+_ALL_GATHER_HANDLE = "_all_gather_handle"
+
+
+def wait_all_gather_handle(p):
+    if hasattr(p, _ALL_GATHER_HANDLE):
+        handle = getattr(p, _ALL_GATHER_HANDLE)
+        handle.wait()
+        delattr(p, _ALL_GATHER_HANDLE)
+
+
+def set_all_gather_handle(p, handle):
+    setattr(p, _ALL_GATHER_HANDLE, handle)
+
+
+class ZeroOpHook(ColoParamOpHook):
+    def pre_forward(self, params: List[Tensor]) -> None:
+        for p in params:
+            wait_all_gather_handle(p)
+
+    def post_forward(self, params: List[Tensor]) -> None:
+        pass
+
+    def pre_backward(self, params: List[Tensor]) -> None:
+        pass
+
+    def post_backward(self, params: List[Tensor]) -> None:
+        pass
diff --git a/examples/inference/stable_diffusion/README.md b/examples/inference/stable_diffusion/README.md
new file mode 100644
index 000000000000..c11b9804392c
--- /dev/null
+++ b/examples/inference/stable_diffusion/README.md
@@ -0,0 +1,22 @@
+## File Structure
+```
+|- sd3_generation.py: an example of how to use Colossalai Inference Engine to generate result by loading Diffusion Model.
+|- compute_metric.py: compare the quality of images w/o some acceleration method like Distrifusion
+|- benchmark_sd3.py: benchmark the performance of our InferenceEngine
+|- run_benchmark.sh: run benchmark command
+```
+Note: compute_metric.py need some dependencies which need `pip install -r requirements.txt`, `requirements.txt` is in `examples/inference/stable_diffusion/`
+
+## Run Inference
+
+The provided example `sd3_generation.py` is an example to configure, initialize the engine, and run inference on provided model. We've added `DiffusionPipeline` as model class, and the script is good to run inference with StableDiffusion 3.
+
+For a basic setting, you could run the example by:
+```bash
+colossalai run --nproc_per_node 1 sd3_generation.py -m PATH_MODEL -p "hello world"
+```
+
+Run multi-GPU inference (Patched Parallelism), as in the following example using 2 GPUs:
+```bash
+colossalai run --nproc_per_node 2 sd3_generation.py -m PATH_MODEL
+```
diff --git a/examples/inference/stable_diffusion/benchmark_sd3.py b/examples/inference/stable_diffusion/benchmark_sd3.py
new file mode 100644
index 000000000000..19db57c33c82
--- /dev/null
+++ b/examples/inference/stable_diffusion/benchmark_sd3.py
@@ -0,0 +1,179 @@
+import argparse
+import json
+import time
+from contextlib import nullcontext
+
+import torch
+import torch.distributed as dist
+from diffusers import DiffusionPipeline
+
+import colossalai
+from colossalai.inference.config import DiffusionGenerationConfig, InferenceConfig
+from colossalai.inference.core.engine import InferenceEngine
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
+
+GIGABYTE = 1024**3
+MEGABYTE = 1024 * 1024
+
+_DTYPE_MAPPING = {
+    "fp16": torch.float16,
+    "bf16": torch.bfloat16,
+    "fp32": torch.float32,
+}
+
+
+def log_generation_time(log_data, log_file):
+    with open(log_file, "a") as f:
+        json.dump(log_data, f, indent=2)
+        f.write("\n")
+
+
+def warmup(engine, args):
+    for _ in range(args.n_warm_up_steps):
+        engine.generate(
+            prompts=["hello world"],
+            generation_config=DiffusionGenerationConfig(
+                num_inference_steps=args.num_inference_steps, height=args.height[0], width=args.width[0]
+            ),
+        )
+
+
+def profile_context(args):
+    return (
+        torch.profiler.profile(
+            record_shapes=True,
+            with_stack=True,
+            with_modules=True,
+            activities=[
+                torch.profiler.ProfilerActivity.CPU,
+                torch.profiler.ProfilerActivity.CUDA,
+            ],
+        )
+        if args.profile
+        else nullcontext()
+    )
+
+
+def log_and_profile(h, w, avg_time, log_msg, args, model_name, mode, prof=None):
+    log_data = {
+        "mode": mode,
+        "model": model_name,
+        "batch_size": args.batch_size,
+        "patched_parallel_size": args.patched_parallel_size,
+        "num_inference_steps": args.num_inference_steps,
+        "height": h,
+        "width": w,
+        "dtype": args.dtype,
+        "profile": args.profile,
+        "n_warm_up_steps": args.n_warm_up_steps,
+        "n_repeat_times": args.n_repeat_times,
+        "avg_generation_time": avg_time,
+        "log_message": log_msg,
+    }
+
+    if args.log:
+        log_file = f"examples/inference/stable_diffusion/benchmark_{model_name}_{mode}.json"
+        log_generation_time(log_data=log_data, log_file=log_file)
+
+    if args.profile:
+        file = f"examples/inference/stable_diffusion/benchmark_{model_name}_{mode}_prof.json"
+        prof.export_chrome_trace(file)
+
+
+def benchmark_colossalai(rank, world_size, port, args):
+    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    from colossalai.cluster.dist_coordinator import DistCoordinator
+
+    coordinator = DistCoordinator()
+
+    inference_config = InferenceConfig(
+        dtype=args.dtype,
+        patched_parallelism_size=args.patched_parallel_size,
+    )
+    engine = InferenceEngine(args.model, inference_config=inference_config, verbose=False)
+
+    warmup(engine, args)
+
+    for h, w in zip(args.height, args.width):
+        with profile_context(args) as prof:
+            start = time.perf_counter()
+            for _ in range(args.n_repeat_times):
+                engine.generate(
+                    prompts=["hello world"],
+                    generation_config=DiffusionGenerationConfig(
+                        num_inference_steps=args.num_inference_steps, height=h, width=w
+                    ),
+                )
+            end = time.perf_counter()
+
+        avg_time = (end - start) / args.n_repeat_times
+        log_msg = f"[ColossalAI]avg generation time for h({h})xw({w}) is {avg_time:.2f}s"
+        coordinator.print_on_master(log_msg)
+
+        if dist.get_rank() == 0:
+            log_and_profile(h, w, avg_time, log_msg, args, args.model.split("/")[-1], "colossalai", prof=prof)
+
+
+def benchmark_diffusers(args):
+    model = DiffusionPipeline.from_pretrained(args.model, torch_dtype=_DTYPE_MAPPING[args.dtype]).to("cuda")
+
+    for _ in range(args.n_warm_up_steps):
+        model(
+            prompt="hello world",
+            num_inference_steps=args.num_inference_steps,
+            height=args.height[0],
+            width=args.width[0],
+        )
+
+    for h, w in zip(args.height, args.width):
+        with profile_context(args) as prof:
+            start = time.perf_counter()
+            for _ in range(args.n_repeat_times):
+                model(prompt="hello world", num_inference_steps=args.num_inference_steps, height=h, width=w)
+            end = time.perf_counter()
+
+        avg_time = (end - start) / args.n_repeat_times
+        log_msg = f"[Diffusers]avg generation time for h({h})xw({w}) is {avg_time:.2f}s"
+        print(log_msg)
+
+        log_and_profile(h, w, avg_time, log_msg, args, args.model.split("/")[-1], "diffusers", prof)
+
+
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def benchmark(args):
+    if args.mode == "colossalai":
+        spawn(benchmark_colossalai, nprocs=args.patched_parallel_size, args=args)
+    elif args.mode == "diffusers":
+        benchmark_diffusers(args)
+
+
+"""
+# enable log
+python examples/inference/stable_diffusion/benchmark_sd3.py -m "PixArt-alpha/PixArt-XL-2-1024-MS" -p 2 --mode colossalai --log
+python examples/inference/stable_diffusion/benchmark_sd3.py -m "PixArt-alpha/PixArt-XL-2-1024-MS" --mode diffusers --log
+
+# enable profiler
+python examples/inference/stable_diffusion/benchmark_sd3.py -m "stabilityai/stable-diffusion-3-medium-diffusers" -p 2 --mode colossalai --n_warm_up_steps 3 --n_repeat_times 1 --profile --num_inference_steps 20
+python examples/inference/stable_diffusion/benchmark_sd3.py -m "PixArt-alpha/PixArt-XL-2-1024-MS" -p 2 --mode colossalai --n_warm_up_steps 3 --n_repeat_times 1 --profile --num_inference_steps 20
+python examples/inference/stable_diffusion/benchmark_sd3.py -m "PixArt-alpha/PixArt-XL-2-1024-MS" --mode diffusers --n_warm_up_steps 3 --n_repeat_times 1 --profile --num_inference_steps 20
+"""
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-b", "--batch_size", type=int, default=1, help="Batch size")
+    parser.add_argument("-p", "--patched_parallel_size", type=int, default=1, help="Patched Parallelism size")
+    parser.add_argument("-n", "--num_inference_steps", type=int, default=50, help="Number of inference steps")
+    parser.add_argument("-H", "--height", type=int, nargs="+", default=[1024, 2048], help="Height list")
+    parser.add_argument("-w", "--width", type=int, nargs="+", default=[1024, 2048], help="Width list")
+    parser.add_argument("--dtype", type=str, default="fp16", choices=["fp16", "fp32", "bf16"], help="Data type")
+    parser.add_argument("--n_warm_up_steps", type=int, default=3, help="Number of warm up steps")
+    parser.add_argument("--n_repeat_times", type=int, default=5, help="Number of repeat times")
+    parser.add_argument("--profile", default=False, action="store_true", help="Enable torch profiler")
+    parser.add_argument("--log", default=False, action="store_true", help="Enable logging")
+    parser.add_argument("-m", "--model", default="stabilityai/stable-diffusion-3-medium-diffusers", help="Model path")
+    parser.add_argument(
+        "--mode", default="colossalai", choices=["colossalai", "diffusers"], help="Inference framework mode"
+    )
+    args = parser.parse_args()
+    benchmark(args)
diff --git a/examples/inference/stable_diffusion/compute_metric.py b/examples/inference/stable_diffusion/compute_metric.py
new file mode 100644
index 000000000000..14c92501b66d
--- /dev/null
+++ b/examples/inference/stable_diffusion/compute_metric.py
@@ -0,0 +1,80 @@
+# Code from https://github.com/mit-han-lab/distrifuser/blob/main/scripts/compute_metrics.py
+import argparse
+import os
+
+import numpy as np
+import torch
+from cleanfid import fid
+from PIL import Image
+from torch.utils.data import DataLoader, Dataset
+from torchmetrics.image import LearnedPerceptualImagePatchSimilarity, PeakSignalNoiseRatio
+from torchvision.transforms import Resize
+from tqdm import tqdm
+
+
+def read_image(path: str):
+    """
+    input: path
+    output: tensor (C, H, W)
+    """
+    img = np.asarray(Image.open(path))
+    if len(img.shape) == 2:
+        img = np.repeat(img[:, :, None], 3, axis=2)
+    img = torch.from_numpy(img).permute(2, 0, 1)
+    return img
+
+
+class MultiImageDataset(Dataset):
+    def __init__(self, root0, root1, is_gt=False):
+        super().__init__()
+        self.root0 = root0
+        self.root1 = root1
+        file_names0 = os.listdir(root0)
+        file_names1 = os.listdir(root1)
+
+        self.image_names0 = sorted([name for name in file_names0 if name.endswith(".png") or name.endswith(".jpg")])
+        self.image_names1 = sorted([name for name in file_names1 if name.endswith(".png") or name.endswith(".jpg")])
+        self.is_gt = is_gt
+        assert len(self.image_names0) == len(self.image_names1)
+
+    def __len__(self):
+        return len(self.image_names0)
+
+    def __getitem__(self, idx):
+        img0 = read_image(os.path.join(self.root0, self.image_names0[idx]))
+        if self.is_gt:
+            # resize to 1024 x 1024
+            img0 = Resize((1024, 1024))(img0)
+        img1 = read_image(os.path.join(self.root1, self.image_names1[idx]))
+
+        batch_list = [img0, img1]
+        return batch_list
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--batch_size", type=int, default=64)
+    parser.add_argument("--num_workers", type=int, default=8)
+    parser.add_argument("--is_gt", action="store_true")
+    parser.add_argument("--input_root0", type=str, required=True)
+    parser.add_argument("--input_root1", type=str, required=True)
+    args = parser.parse_args()
+
+    psnr = PeakSignalNoiseRatio(data_range=(0, 1), reduction="elementwise_mean", dim=(1, 2, 3)).to("cuda")
+    lpips = LearnedPerceptualImagePatchSimilarity(normalize=True).to("cuda")
+
+    dataset = MultiImageDataset(args.input_root0, args.input_root1, is_gt=args.is_gt)
+    dataloader = DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers)
+
+    progress_bar = tqdm(dataloader)
+    with torch.inference_mode():
+        for i, batch in enumerate(progress_bar):
+            batch = [img.to("cuda") / 255 for img in batch]
+            batch_size = batch[0].shape[0]
+            psnr.update(batch[0], batch[1])
+            lpips.update(batch[0], batch[1])
+    fid_score = fid.compute_fid(args.input_root0, args.input_root1)
+
+    print("PSNR:", psnr.compute().item())
+    print("LPIPS:", lpips.compute().item())
+    print("FID:", fid_score)
diff --git a/examples/inference/stable_diffusion/requirements.txt b/examples/inference/stable_diffusion/requirements.txt
new file mode 100644
index 000000000000..c4e74162dfb5
--- /dev/null
+++ b/examples/inference/stable_diffusion/requirements.txt
@@ -0,0 +1,3 @@
+torchvision
+torchmetrics
+cleanfid
diff --git a/examples/inference/stable_diffusion/run_benchmark.sh b/examples/inference/stable_diffusion/run_benchmark.sh
new file mode 100644
index 000000000000..f3e45a335219
--- /dev/null
+++ b/examples/inference/stable_diffusion/run_benchmark.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+models=("PixArt-alpha/PixArt-XL-2-1024-MS" "stabilityai/stable-diffusion-3-medium-diffusers")
+parallelism=(1 2 4 8)
+resolutions=(1024 2048 3840)
+modes=("colossalai" "diffusers")
+
+CUDA_VISIBLE_DEVICES_set_n_least_memory_usage() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv \
+        | tail -n +2 \
+        | nl -v 0 \
+        | tee /dev/tty \
+        | sort -g -k 2 \
+        | awk '{print $1}' \
+        | head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
+
+for model in "${models[@]}"; do
+    for p in "${parallelism[@]}"; do
+        for resolution in "${resolutions[@]}"; do
+            for mode in "${modes[@]}"; do
+                if [[ "$mode" == "colossalai" && "$p" == 1 ]]; then
+                    continue
+                fi
+                if [[ "$mode" == "diffusers" && "$p" != 1 ]]; then
+                    continue
+                fi
+                CUDA_VISIBLE_DEVICES_set_n_least_memory_usage $p
+
+                cmd="python examples/inference/stable_diffusion/benchmark_sd3.py -m \"$model\" -p $p --mode $mode --log -H $resolution -w $resolution"
+
+                echo "Executing: $cmd"
+                eval $cmd
+            done
+        done
+    done
+done
diff --git a/examples/inference/stable_diffusion/sd3_generation.py b/examples/inference/stable_diffusion/sd3_generation.py
new file mode 100644
index 000000000000..9e146c34b937
--- /dev/null
+++ b/examples/inference/stable_diffusion/sd3_generation.py
@@ -0,0 +1,81 @@
+import argparse
+
+from diffusers import DiffusionPipeline
+from torch import bfloat16
+from torch import distributed as dist
+from torch import float16, float32
+
+import colossalai
+from colossalai.cluster import DistCoordinator
+from colossalai.inference.config import DiffusionGenerationConfig, InferenceConfig
+from colossalai.inference.core.engine import InferenceEngine
+
+# For Stable Diffusion 3, we'll use the following configuration
+MODEL_CLS = DiffusionPipeline
+
+TORCH_DTYPE_MAP = {
+    "fp16": float16,
+    "fp32": float32,
+    "bf16": bfloat16,
+}
+
+
+def infer(args):
+    # ==============================
+    # Launch colossalai, setup distributed environment
+    # ==============================
+    colossalai.launch_from_torch()
+    coordinator = DistCoordinator()
+
+    # ==============================
+    # Load model and tokenizer
+    # ==============================
+    model_path_or_name = args.model
+    model = MODEL_CLS.from_pretrained(model_path_or_name, torch_dtype=TORCH_DTYPE_MAP.get(args.dtype, None))
+
+    # ==============================
+    # Initialize InferenceEngine
+    # ==============================
+    coordinator.print_on_master(f"Initializing Inference Engine...")
+    inference_config = InferenceConfig(
+        dtype=args.dtype,
+        max_batch_size=args.max_batch_size,
+        tp_size=args.tp_size,
+        use_cuda_kernel=args.use_cuda_kernel,
+        patched_parallelism_size=dist.get_world_size(),
+    )
+    engine = InferenceEngine(model, inference_config=inference_config, verbose=True)
+
+    # ==============================
+    # Generation
+    # ==============================
+    coordinator.print_on_master(f"Generating...")
+    out = engine.generate(prompts=[args.prompt], generation_config=DiffusionGenerationConfig())[0]
+    if dist.get_rank() == 0:
+        out.save(f"cat_parallel_size{dist.get_world_size()}.jpg")
+    coordinator.print_on_master(out)
+
+
+# colossalai run --nproc_per_node 1 examples/inference/stable_diffusion/sd3_generation.py -m MODEL_PATH
+
+# colossalai run --nproc_per_node 1 examples/inference/stable_diffusion/sd3_generation.py -m "stabilityai/stable-diffusion-3-medium-diffusers" --tp_size 1
+# colossalai run --nproc_per_node 2 examples/inference/stable_diffusion/sd3_generation.py -m "stabilityai/stable-diffusion-3-medium-diffusers" --tp_size 1
+
+# colossalai run --nproc_per_node 1 examples/inference/stable_diffusion/sd3_generation.py -m "PixArt-alpha/PixArt-XL-2-1024-MS" --tp_size 1
+# colossalai run --nproc_per_node 2 examples/inference/stable_diffusion/sd3_generation.py -m "PixArt-alpha/PixArt-XL-2-1024-MS" --tp_size 1
+
+
+if __name__ == "__main__":
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", "--model", type=str, help="Path to the model or model name")
+    parser.add_argument("-t", "--tp_size", type=int, default=1, help="Tensor Parallelism size")
+    parser.add_argument("-p", "--prompt", type=str, default="A cat holding a sign that says hello world", help="Prompt")
+    parser.add_argument("-b", "--max_batch_size", type=int, default=1, help="Max batch size")
+    parser.add_argument("-d", "--dtype", type=str, default="fp16", help="Data type", choices=["fp16", "fp32", "bf16"])
+    parser.add_argument("--use_cuda_kernel", action="store_true", help="Use CUDA kernel, use Triton by default")
+    args = parser.parse_args()
+
+    infer(args)
diff --git a/examples/inference/stable_diffusion/test_ci.sh b/examples/inference/stable_diffusion/test_ci.sh
new file mode 100644
index 000000000000..d0189431cb20
--- /dev/null
+++ b/examples/inference/stable_diffusion/test_ci.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+echo "Skip the test (this test is slow)"
diff --git a/examples/language/gpt/hybridparallelism/finetune.py b/examples/language/gpt/hybridparallelism/finetune.py
index 9b3a101609dc..f447adf693f8 100644
--- a/examples/language/gpt/hybridparallelism/finetune.py
+++ b/examples/language/gpt/hybridparallelism/finetune.py
@@ -1,4 +1,5 @@
 import argparse
+from contextlib import nullcontext
 from typing import Callable, List, Union
 
 import evaluate
@@ -17,6 +18,7 @@
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 
 # ==============================
@@ -252,10 +254,16 @@ def main():
         pad_token_id=data_builder.tokenizer.pad_token_id,
     )
 
-    if model_name == "gpt2":
-        model = GPT2ForSequenceClassification.from_pretrained(model_name, config=cfg).cuda()
-    else:
-        raise RuntimeError
+    init_ctx = (
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if isinstance(plugin, (GeminiPlugin))
+        else nullcontext()
+    )
+    with init_ctx:
+        if model_name == "gpt2":
+            model = GPT2ForSequenceClassification.from_pretrained(model_name, config=cfg).cuda()
+        else:
+            raise RuntimeError
 
     # optimizer
     no_decay = ["bias", "LayerNorm.weight"]
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 8a35db1f7038..e530e2d6a153 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -98,6 +98,7 @@ def main():
     parser.add_argument("--disable-async-reduce", action="store_true", help="Disable the asynchronous reduce operation")
     parser.add_argument("--prefetch_num", type=int, default=0, help="chunk prefetch max number")
     parser.add_argument("--no_cache", action="store_true")
+    parser.add_argument("--overlap_allgather", action="store_true")
     args = parser.parse_args()
 
     colossalai.launch_from_torch()
@@ -199,9 +200,9 @@ def empty_init():
             enable_flash_attention=args.xformers,
             microbatch_size=args.mbs,
             precision="bf16",
-            dp_outside=False,
             overlap_p2p=args.overlap,
             enable_metadata_cache=not args.no_cache,
+            overlap_allgather=args.overlap_allgather,
             **hybrid_kwargs,
         )
     elif args.plugin == "3d_cpu":
@@ -292,7 +293,7 @@ def empty_init():
     with get_profile_context(
         args.profile,
         args.ignore_steps,
-        len(dataloader) - 1,
+        1,  # avoid creating massive log files
         save_dir=f"profile/{time.strftime('%H:%M', time.localtime())}-{args.plugin}-llama-{args.config}",
     ) as prof:
         if isinstance(plugin, HybridParallelPlugin) and args.pp > 1:
diff --git a/examples/language/opt/opt_benchmark.py b/examples/language/opt/opt_benchmark.py
index c2883d96c16e..ca9b63d1a14a 100755
--- a/examples/language/opt/opt_benchmark.py
+++ b/examples/language/opt/opt_benchmark.py
@@ -1,4 +1,5 @@
 import time
+from contextlib import nullcontext
 
 import torch
 import tqdm
@@ -8,9 +9,11 @@
 from transformers.utils.versions import require_version
 
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
 
@@ -62,14 +65,6 @@ def main():
     if args.mem_cap > 0:
         colo_memory_cap(args.mem_cap)
 
-    # Build OPT model
-    config = AutoConfig.from_pretrained(args.model_name_or_path)
-    model = OPTForCausalLM(config=config)
-    logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0])
-
-    # Enable gradient checkpointing
-    model.gradient_checkpointing_enable()
-
     # Set plugin
     booster_kwargs = {}
     if args.plugin == "torch_ddp_fp16":
@@ -82,6 +77,19 @@ def main():
         plugin = LowLevelZeroPlugin(initial_scale=2**5)
     logger.info(f"Set plugin as {args.plugin}", ranks=[0])
 
+    # Build OPT model
+    init_ctx = (
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if isinstance(plugin, (GeminiPlugin))
+        else nullcontext()
+    )
+    config = AutoConfig.from_pretrained(args.model_name_or_path)
+    with init_ctx:
+        model = OPTForCausalLM(config=config)
+    logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0])
+
+    # Enable gradient checkpointing
+    model.gradient_checkpointing_enable()
     # Set optimizer
     optimizer = HybridAdam(model.parameters(), lr=args.learning_rate)
 
diff --git a/examples/language/opt/opt_train_demo.py b/examples/language/opt/opt_train_demo.py
index b5b50305cc34..50dfc7bffd07 100644
--- a/examples/language/opt/opt_train_demo.py
+++ b/examples/language/opt/opt_train_demo.py
@@ -1,3 +1,5 @@
+from contextlib import nullcontext
+
 import datasets
 import torch
 import transformers
@@ -8,9 +10,11 @@
 from transformers.utils.versions import require_version
 
 import colossalai
+from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
 from colossalai.logging import disable_existing_loggers, get_dist_logger
 from colossalai.nn.optimizer import HybridAdam
 
@@ -78,14 +82,6 @@ def main():
         datasets.utils.logging.set_verbosity_error()
         transformers.utils.logging.set_verbosity_error()
 
-    # Build OPT model
-    config = AutoConfig.from_pretrained(args.model_name_or_path)
-    model = OPTForCausalLM.from_pretrained(args.model_name_or_path, config=config)
-    logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0])
-
-    # Enable gradient checkpointing
-    model.gradient_checkpointing_enable()
-
     # Set plugin
     booster_kwargs = {}
     if args.plugin == "torch_ddp_fp16":
@@ -110,6 +106,21 @@ def main():
 
     logger.info(f"Set plugin as {args.plugin}", ranks=[0])
 
+    # Build OPT model
+    config = AutoConfig.from_pretrained(args.model_name_or_path)
+    # Build OPT model
+    init_ctx = (
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if isinstance(plugin, (GeminiPlugin, HybridParallelPlugin))
+        else nullcontext()
+    )
+    with init_ctx:
+        model = OPTForCausalLM.from_pretrained(args.model_name_or_path, config=config)
+    logger.info(f"Finish loading model from {args.model_name_or_path}", ranks=[0])
+
+    # Enable gradient checkpointing
+    model.gradient_checkpointing_enable()
+
     # Prepare tokenizer and dataloader
     tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
     dataset = NetflixDataset(tokenizer)
diff --git a/examples/language/performance_evaluator.py b/examples/language/performance_evaluator.py
index 6b8daf37d678..ca4a02cd2981 100644
--- a/examples/language/performance_evaluator.py
+++ b/examples/language/performance_evaluator.py
@@ -113,13 +113,13 @@ def on_step_start(self, step: int) -> None:
         self.disable = self.ignore_steps > 0 and step < self.ignore_steps
         if self.disable:
             return
-        get_accelerator().synchronize()
+        # get_accelerator().synchronize()
         self.timer.start()
 
     def on_step_end(self, input_ids: Tensor, **kwargs) -> None:
         if self.disable:
             return
-        get_accelerator().synchronize()
+        # get_accelerator().synchronize()
         self.timer.end()
 
         batch_size, seq_len = input_ids.shape
diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index e4affc7f5396..93a3690fe1d3 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -1,4 +1,3 @@
-diffusers
 pytest
 coverage==7.2.3
 git+https://github.com/hpcaitech/pytest-testmon
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 27bbc3769448..651eb66e89ab 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -8,7 +8,7 @@ click
 fabric
 contexttimer
 ninja
-torch>=2.1.0,<2.3.0
+torch>=2.1.0,<=2.3.0
 safetensors
 einops
 pydantic
@@ -23,3 +23,4 @@ rpyc==6.0.0
 fastapi
 uvicorn==0.29.0
 galore_torch
+diffusers==0.29.0
diff --git a/tests/kit/model_zoo/transformers/__init__.py b/tests/kit/model_zoo/transformers/__init__.py
index 05c17f562635..4adc386192d3 100644
--- a/tests/kit/model_zoo/transformers/__init__.py
+++ b/tests/kit/model_zoo/transformers/__init__.py
@@ -3,28 +3,17 @@
 from .blip2 import *
 from .bloom import *
 from .chatglm2 import *
+from .command import *
+from .deepseek import *
 from .falcon import *
 from .gpt import *
 from .gptj import *
 from .llama import *
+from .mistral import *
+from .mixtral import *
 from .opt import *
+from .qwen2 import *
 from .sam import *
 from .t5 import *
 from .vit import *
 from .whisper import *
-
-try:
-    from .mistral import *
-except ImportError:
-    print("This version of transformers doesn't support mistral.")
-
-try:
-    from .qwen2 import *
-except ImportError:
-    print("This version of transformers doesn't support qwen2.")
-
-
-try:
-    from .command import *
-except ImportError:
-    print("This version of transformers doesn't support Command-R.")
diff --git a/tests/kit/model_zoo/transformers/deepseek.py b/tests/kit/model_zoo/transformers/deepseek.py
new file mode 100644
index 000000000000..ad73640a57c5
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/deepseek.py
@@ -0,0 +1,83 @@
+# modified from tests/kit/model_zoo/transformers/mistral.py
+import torch
+import transformers
+from transformers import AutoConfig
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register single-sentence Mixtral
+# ===============================
+
+
+def data_gen():
+    # Generated from following code snippet
+    #
+    # from transformers import AutoModelForCausalLM, AutoTokenizer
+    # tokenizer = AutoTokenizer.from_pretrained("mixtralai/Mixtral-7B-v0.1")
+    # input = 'My favourite condiment is vinegar' (last two words repeated to satisfy length requirement)
+    # tokenized_input = tokenizer([input], return_tensors="pt")
+    # input_ids = tokenized_input['input_ids']
+    # attention_mask = tokenized_input['attention_mask']
+    input_ids = torch.tensor([[1, 22, 55, 77, 532, 349, 43, 22]], dtype=torch.int64)
+    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
+    return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+
+def data_gen_for_lm():
+    # LM data gen
+    # the `labels` of LM is the token of the output, cause no padding, use `input_ids` as `labels`
+    data = data_gen()
+    data["labels"] = data["input_ids"].clone()
+    return data
+
+
+def data_gen_for_sequence_classification():
+    # sequence classification data gen
+    data = data_gen()
+    data["labels"] = torch.tensor([1], dtype=torch.int64)
+    return data
+
+
+# define output transform function
+output_transform_fn = lambda x: x
+
+# define loss function
+loss_fn_for_mixtral_model = lambda x: x[0].mean()
+loss_fn = lambda x: x.loss
+loss_fn_for_seq_classification = lambda output: output.logits.mean()
+
+
+def init_deepseek():
+
+    config = AutoConfig.from_pretrained(
+        "deepseek-ai/deepseek-moe-16b-base",
+        hidden_size=32,
+        intermediate_size=32,
+        moe_intermediate_size=32,
+        num_hidden_layers=2,
+        num_attention_heads=8,
+        num_key_value_heads=8,
+        # vocab_size=2200,
+        first_k_dense_replace=1,
+        attn_implementation="flash_attention_2",
+        torch_dtype="float16",
+        n_routed_experts=8,
+        trust_remote_code=True,
+    )
+
+    if hasattr(config, "pad_token_id"):
+        config.pad_token_id = config.eos_token_id
+    model = transformers.AutoModel.from_config(config, trust_remote_code=True)
+
+    return model
+
+
+model_zoo.register(
+    name="transformers_deepseek",
+    model_fn=init_deepseek,
+    data_gen_fn=data_gen,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn_for_mixtral_model,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
diff --git a/tests/kit/model_zoo/transformers/mixtral.py b/tests/kit/model_zoo/transformers/mixtral.py
new file mode 100644
index 000000000000..73c0e9e2c500
--- /dev/null
+++ b/tests/kit/model_zoo/transformers/mixtral.py
@@ -0,0 +1,85 @@
+# modified from tests/kit/model_zoo/transformers/mistral.py
+import torch
+import transformers
+from transformers import MixtralConfig
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register single-sentence Mixtral
+# ===============================
+
+
+def data_gen():
+    # Generated from following code snippet
+    #
+    # from transformers import AutoModelForCausalLM, AutoTokenizer
+    # tokenizer = AutoTokenizer.from_pretrained("mixtralai/Mixtral-7B-v0.1")
+    # input = 'My favourite condiment is vinegar' (last two words repeated to satisfy length requirement)
+    # tokenized_input = tokenizer([input], return_tensors="pt")
+    # input_ids = tokenized_input['input_ids']
+    # attention_mask = tokenized_input['attention_mask']
+    input_ids = torch.tensor([[1, 22, 55, 77, 532, 349, 43, 22]], dtype=torch.int64)
+    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
+    return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+
+def data_gen_for_lm():
+    # LM data gen
+    # the `labels` of LM is the token of the output, cause no padding, use `input_ids` as `labels`
+    data = data_gen()
+    data["labels"] = data["input_ids"].clone()
+    return data
+
+
+def data_gen_for_sequence_classification():
+    # sequence classification data gen
+    data = data_gen()
+    data["labels"] = torch.tensor([1], dtype=torch.int64)
+    return data
+
+
+# define output transform function
+output_transform_fn = lambda x: x
+
+# define loss function
+loss_fn_for_mixtral_model = lambda x: x[0].mean()
+loss_fn = lambda x: x.loss
+loss_fn_for_seq_classification = lambda output: output.logits.mean()
+
+config = MixtralConfig(
+    hidden_size=32,
+    intermediate_size=32,
+    num_attention_heads=8,
+    num_hidden_layers=2,
+    vocab_size=1000,
+    output_router_logits=True,
+)
+
+if hasattr(config, "pad_token_id"):
+    config.pad_token_id = config.eos_token_id
+
+model_zoo.register(
+    name="transformers_mixtral",
+    model_fn=lambda: transformers.MixtralModel(config),
+    data_gen_fn=data_gen,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn_for_mixtral_model,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
+# model_zoo.register(
+#     name="transformers_mixtral_for_casual_lm",
+#     model_fn=lambda: transformers.MixtralForCausalLM(config),
+#     data_gen_fn=data_gen_for_lm,
+#     output_transform_fn=output_transform_fn,
+#     loss_fn=loss_fn,
+#     model_attribute=ModelAttribute(has_control_flow=True),
+# )
+# model_zoo.register(
+#     name="transformers_mixtral_for_sequence_classification",
+#     model_fn=lambda: transformers.MixtralForSequenceClassification(config),
+#     data_gen_fn=data_gen_for_sequence_classification,
+#     output_transform_fn=output_transform_fn,
+#     loss_fn=loss_fn_for_seq_classification,
+#     model_attribute=ModelAttribute(has_control_flow=True),
+# )
diff --git a/tests/test_legacy/test_moe/moe_utils.py b/tests/test_legacy/test_moe/moe_utils.py
new file mode 100644
index 000000000000..8c133849b000
--- /dev/null
+++ b/tests/test_legacy/test_moe/moe_utils.py
@@ -0,0 +1,136 @@
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed import ProcessGroup
+
+from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
+from colossalai.legacy.engine.gradient_handler._base_gradient_handler import BaseGradientHandler
+from colossalai.legacy.engine.gradient_handler.utils import bucket_allreduce
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import get_moe_epsize_param_dict
+from colossalai.legacy.registry import GRADIENT_HANDLER
+from colossalai.tensor.moe_tensor.api import get_ep_group, get_ep_size, set_moe_tensor_ep_group
+
+
+def delete_moe_info(model):
+    for _, param in model.named_parameters():
+        if hasattr(param, "ep_group"):
+            delattr(param, "ep_group")
+
+
+class MoeModel(nn.Module):
+    def __init__(self, ep_group: ProcessGroup = None):
+        super().__init__()
+        self.test_embed = nn.Linear(4, 16, bias=False)
+        self.w1 = torch.nn.Parameter(torch.randn(16, 8))
+        if ep_group:
+            set_moe_tensor_ep_group(self.w1, ep_group)
+
+    def forward(self, x):
+        x = self.test_embed(x)
+        x = torch.matmul(x, self.w1)
+
+        return x
+
+
+@GRADIENT_HANDLER.register_module
+class MoeGradientHandler(BaseGradientHandler):
+    """A helper class to handle all-reduce operations in a data parallel group and
+    moe model parallel. A all-reduce collective communication will be operated in
+    :func:`handle_gradient` among a data parallel group.
+    For better performance, it bucketizes the gradients of all parameters that are
+    the same type to improve the efficiency of communication.
+
+    Args:
+        model (Module): Model where the gradients accumulate.
+        optimizer (Optimizer): Optimizer for updating the parameters.
+    """
+
+    def __init__(self, model, optimizer=None):
+        super().__init__(model, optimizer)
+
+    def handle_gradient(self):
+        """A method running an all-reduce operation in a data parallel group.
+        Then running an all-reduce operation for all parameters in experts
+        across moe model parallel group
+        """
+        if dist.get_world_size() > 1:
+            epsize_param_dict = get_moe_epsize_param_dict(self._model)
+
+            # epsize is 1, indicating the params are replicated among processes in data parallelism
+            # use the ParallelMode.DATA to get data parallel group
+            # reduce gradients for all parameters in data parallelism
+            if 1 in epsize_param_dict:
+                bucket_allreduce(param_list=epsize_param_dict[1])
+
+            for ep_size in epsize_param_dict:
+                if ep_size != 1 and ep_size != MOE_MANAGER.world_size:
+                    bucket_allreduce(
+                        param_list=epsize_param_dict[ep_size], group=MOE_MANAGER.parallel_info_dict[ep_size].dp_group
+                    )
+
+
+def assert_not_equal_in_group(tensor, process_group=None):
+    # all gather tensors from different ranks
+    world_size = dist.get_world_size(process_group)
+    tensor_list = [torch.empty_like(tensor) for _ in range(world_size)]
+    dist.all_gather(tensor_list, tensor, group=process_group)
+
+    # check if they are equal one by one
+    for i in range(world_size - 1):
+        a = tensor_list[i]
+        b = tensor_list[i + 1]
+        assert not torch.allclose(a, b), (
+            f"expected tensors on rank {i} and {i + 1} not to be equal " f"but they are, {a} vs {b}"
+        )
+
+
+def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False):
+    model.train()
+    with torch.cuda.amp.autocast(enabled=enable_autocast):
+        if criterion:
+            y = model(data)
+            loss = criterion(y, label)
+        else:
+            loss = model(data, label)
+        loss = loss.float()
+
+    if isinstance(model, LowLevelZeroModel):
+        optimizer.backward(loss)
+    else:
+        loss.backward()
+    return y
+
+
+def sync_local_from_ep(local_model, ep_model, assert_grad_flag: bool = False) -> None:
+    """Sync the parameters of tp model from ep model
+
+    Args:
+        local_model (MoeModule)
+        ep_model (MoeModule)
+    """
+    for (local_name, local_param), (ep_name, ep_param) in zip(
+        local_model.named_parameters(), ep_model.named_parameters()
+    ):
+        if "experts" not in local_name:
+            if assert_grad_flag:
+                assert torch.allclose(local_param, ep_param), f"local_param: {local_param}, ep_param: {ep_param}"
+                assert torch.allclose(local_param.grad, ep_param.grad)
+            else:
+                local_param.data.copy_(ep_param.data)
+            continue
+
+        # gather param from ep model
+        param_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
+        dist.all_gather(param_list, ep_param, group=get_ep_group(ep_param))
+        all_param = torch.cat(param_list, dim=0)
+        if assert_grad_flag:
+            grad_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
+            dist.all_gather(grad_list, ep_param.grad, group=get_ep_group(ep_param))
+            all_grad = torch.cat(grad_list, dim=0)
+
+        if assert_grad_flag:
+            assert torch.allclose(local_param, all_param)
+            assert torch.allclose(local_param.grad, all_grad)
+        else:
+            local_param.data.copy_(all_param.data)
diff --git a/tests/test_moe/test_grad_handler.py b/tests/test_legacy/test_moe/test_grad_handler.py
similarity index 98%
rename from tests/test_moe/test_grad_handler.py
rename to tests/test_legacy/test_moe/test_grad_handler.py
index 25e61b091729..3a782a6dd445 100644
--- a/tests/test_moe/test_grad_handler.py
+++ b/tests/test_legacy/test_moe/test_grad_handler.py
@@ -5,7 +5,7 @@
 
 import colossalai
 from colossalai.accelerator import get_accelerator
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
 
 # from colossalai.shardformer.layer.moe.layers import SparseMLP
 from colossalai.testing import assert_equal_in_group, rerun_if_address_is_in_use, spawn
diff --git a/tests/test_moe/test_moe_group.py b/tests/test_legacy/test_moe/test_moe_group.py
similarity index 95%
rename from tests/test_moe/test_moe_group.py
rename to tests/test_legacy/test_moe/test_moe_group.py
index 89baf1d37b1b..68dac4828fa7 100644
--- a/tests/test_moe/test_moe_group.py
+++ b/tests/test_legacy/test_moe/test_moe_group.py
@@ -4,8 +4,8 @@
 
 import colossalai
 from colossalai.accelerator import get_accelerator
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import sync_moe_model_param
+from colossalai.legacy.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.utils import sync_moe_model_param
 
 # from colossalai.shardformer.layer.moe import MLPExperts
 from colossalai.testing import assert_equal_in_group, rerun_if_address_is_in_use, spawn
diff --git a/tests/test_moe/test_moe_hybrid_zero.py b/tests/test_legacy/test_moe/test_moe_hybrid_zero.py
similarity index 98%
rename from tests/test_moe/test_moe_hybrid_zero.py
rename to tests/test_legacy/test_moe/test_moe_hybrid_zero.py
index 513c4ebda4a5..fdd6d956ef83 100644
--- a/tests/test_moe/test_moe_hybrid_zero.py
+++ b/tests/test_legacy/test_moe/test_moe_hybrid_zero.py
@@ -6,7 +6,7 @@
 from colossalai.booster import Booster
 from colossalai.booster.plugin import LowLevelZeroPlugin
 from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
 from colossalai.tensor.moe_tensor.api import is_moe_tensor
 from colossalai.testing import rerun_if_address_is_in_use, spawn
 from tests.test_moe.moe_utils import MoeModel
diff --git a/tests/test_moe/test_moe_load_balance.py b/tests/test_legacy/test_moe/test_moe_load_balance.py
similarity index 99%
rename from tests/test_moe/test_moe_load_balance.py
rename to tests/test_legacy/test_moe/test_moe_load_balance.py
index ddd3ea368964..adf2dbc1ccf3 100644
--- a/tests/test_moe/test_moe_load_balance.py
+++ b/tests/test_legacy/test_moe/test_moe_load_balance.py
@@ -6,7 +6,7 @@
 from colossalai.booster import Booster
 from colossalai.booster.plugin import LowLevelZeroPlugin
 from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.moe.manager import MOE_MANAGER
+from colossalai.legacy.moe.manager import MOE_MANAGER
 
 # from colossalai.shardformer.layer.moe import apply_load_balance
 from colossalai.tensor.moe_tensor.api import is_moe_tensor
diff --git a/tests/test_lora/test_lora.py b/tests/test_lora/test_lora.py
index b8daf775db0e..1ae17025d31e 100644
--- a/tests/test_lora/test_lora.py
+++ b/tests/test_lora/test_lora.py
@@ -9,7 +9,8 @@
 
 import colossalai
 from colossalai.booster import Booster
-from colossalai.booster.plugin import LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.booster.plugin import HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
+from colossalai.booster.plugin.hybrid_parallel_plugin import HybridParallelModule
 from colossalai.testing import check_state_dict_equal, clear_cache_before_run, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo
 from tests.test_checkpoint_io.utils import shared_tempdir
@@ -20,7 +21,7 @@ def check_fwd_bwd(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type
     model = model_fn()
     lora_config = LoraConfig(task_type=task_type, r=8, lora_alpha=32, lora_dropout=0.1)
 
-    test_plugins = [TorchDDPPlugin(), LowLevelZeroPlugin()]
+    test_plugins = [TorchDDPPlugin(), LowLevelZeroPlugin(), HybridParallelPlugin(tp_size=1, pp_size=1)]
     test_configs = [
         {
             "lora_config": lora_config,
@@ -59,6 +60,8 @@ def check_fwd_bwd(model_fn, data_gen_fn, output_transform_fn, loss_fn, task_type
 
         # test fwd bwd correctness
         test_model = model_load
+        if isinstance(model_load, HybridParallelModule):
+            model_load = model_load.module.module
         model_copy = copy.deepcopy(model_load)
 
         data = data_gen_fn()
diff --git a/tests/test_moe/moe_utils.py b/tests/test_moe/moe_utils.py
index 131932dcb3b3..8c411a33fef6 100644
--- a/tests/test_moe/moe_utils.py
+++ b/tests/test_moe/moe_utils.py
@@ -1,142 +1,8 @@
 import torch
-import torch.distributed as dist
-import torch.nn as nn
-from torch.distributed import ProcessGroup
-from torch.testing import assert_close
 
-from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
-from colossalai.legacy.engine.gradient_handler._base_gradient_handler import BaseGradientHandler
-from colossalai.legacy.engine.gradient_handler.utils import bucket_allreduce
-from colossalai.legacy.registry import GRADIENT_HANDLER
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import get_moe_epsize_param_dict
 
-# from colossalai.shardformer.layer.moe import SparseMLP
-from colossalai.tensor.moe_tensor.api import get_ep_group, get_ep_size, set_moe_tensor_ep_group
-
-
-def delete_moe_info(model):
-    for _, param in model.named_parameters():
-        if hasattr(param, "ep_group"):
-            delattr(param, "ep_group")
-
-
-class MoeModel(nn.Module):
-    def __init__(self, ep_group: ProcessGroup = None):
-        super().__init__()
-        self.test_embed = nn.Linear(4, 16, bias=False)
-        self.w1 = torch.nn.Parameter(torch.randn(16, 8))
-        if ep_group:
-            set_moe_tensor_ep_group(self.w1, ep_group)
-
-    def forward(self, x):
-        x = self.test_embed(x)
-        x = torch.matmul(x, self.w1)
-
-        return x
-
-
-@GRADIENT_HANDLER.register_module
-class MoeGradientHandler(BaseGradientHandler):
-    """A helper class to handle all-reduce operations in a data parallel group and
-    moe model parallel. A all-reduce collective communication will be operated in
-    :func:`handle_gradient` among a data parallel group.
-    For better performance, it bucketizes the gradients of all parameters that are
-    the same type to improve the efficiency of communication.
-
-    Args:
-        model (Module): Model where the gradients accumulate.
-        optimizer (Optimizer): Optimizer for updating the parameters.
-    """
-
-    def __init__(self, model, optimizer=None):
-        super().__init__(model, optimizer)
-
-    def handle_gradient(self):
-        """A method running an all-reduce operation in a data parallel group.
-        Then running an all-reduce operation for all parameters in experts
-        across moe model parallel group
-        """
-        if dist.get_world_size() > 1:
-            epsize_param_dict = get_moe_epsize_param_dict(self._model)
-
-            # epsize is 1, indicating the params are replicated among processes in data parallelism
-            # use the ParallelMode.DATA to get data parallel group
-            # reduce gradients for all parameters in data parallelism
-            if 1 in epsize_param_dict:
-                bucket_allreduce(param_list=epsize_param_dict[1])
-
-            for ep_size in epsize_param_dict:
-                if ep_size != 1 and ep_size != MOE_MANAGER.world_size:
-                    bucket_allreduce(
-                        param_list=epsize_param_dict[ep_size], group=MOE_MANAGER.parallel_info_dict[ep_size].dp_group
-                    )
-
-
-def assert_not_equal_in_group(tensor, process_group=None):
-    # all gather tensors from different ranks
-    world_size = dist.get_world_size(process_group)
-    tensor_list = [torch.empty_like(tensor) for _ in range(world_size)]
-    dist.all_gather(tensor_list, tensor, group=process_group)
-
-    # check if they are equal one by one
-    for i in range(world_size - 1):
-        a = tensor_list[i]
-        b = tensor_list[i + 1]
-        assert not torch.allclose(a, b), (
-            f"expected tensors on rank {i} and {i + 1} not to be equal " f"but they are, {a} vs {b}"
-        )
-
-
-def run_fwd_bwd(model, data, label, criterion, optimizer, enable_autocast=False):
-    model.train()
-    with torch.cuda.amp.autocast(enabled=enable_autocast):
-        if criterion:
-            y = model(data)
-            loss = criterion(y, label)
-        else:
-            loss = model(data, label)
-        loss = loss.float()
-
-    if isinstance(model, LowLevelZeroModel):
-        optimizer.backward(loss)
-    else:
-        loss.backward()
-    return y
-
-
-def sync_local_from_ep(local_model, ep_model, assert_grad_flag: bool = False) -> None:
-    """Sync the parameters of tp model from ep model
-
-    Args:
-        local_model (MoeModule)
-        ep_model (MoeModule)
-    """
-    for (local_name, local_param), (ep_name, ep_param) in zip(
-        local_model.named_parameters(), ep_model.named_parameters()
-    ):
-        if "experts" not in local_name:
-            if assert_grad_flag:
-                assert torch.allclose(local_param, ep_param), f"local_param: {local_param}, ep_param: {ep_param}"
-                assert torch.allclose(local_param.grad, ep_param.grad)
-            else:
-                local_param.data.copy_(ep_param.data)
-            continue
-
-        # gather param from ep model
-        param_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
-        dist.all_gather(param_list, ep_param, group=get_ep_group(ep_param))
-        all_param = torch.cat(param_list, dim=0)
-        if assert_grad_flag:
-            grad_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
-            dist.all_gather(grad_list, ep_param.grad, group=get_ep_group(ep_param))
-            all_grad = torch.cat(grad_list, dim=0)
-
-        if assert_grad_flag:
-            assert torch.allclose(local_param, all_param)
-            assert torch.allclose(local_param.grad, all_grad)
-        else:
-            local_param.data.copy_(all_param.data)
+def assert_loose_close(a, b, dtype: torch.dtype = torch.float32, name=""):
+    assert loose_close(a, b, dtype), f"{name} not close {a.mean()} {b.mean()}"
 
 
 def loose_close(a, b, dtype: torch.dtype = torch.float32):
@@ -148,8 +14,18 @@ def loose_close(a, b, dtype: torch.dtype = torch.float32):
     elif dtype is torch.bfloat16:
         rtol = 4e-3
         atol = 4e-3
+    else:
+        assert dtype is torch.float32
+        rtol = 1e-05
+        atol = 1e-08
 
     a = a.detach().to(dtype)
     b = b.detach().to(dtype).to(a.device)
 
-    assert_close(a, b, rtol=rtol, atol=atol)
+    return torch.allclose(a, b, rtol=rtol, atol=atol)
+
+
+def check_model_equal(model1, model2):
+    assert set(model1.state_dict().keys()) == set(model2.state_dict().keys())
+    for i, ((name, p1), p2) in enumerate(zip(model1.named_parameters(), model2.parameters())):
+        assert_loose_close(p1, p2, p1.dtype)
diff --git a/tests/test_moe/test_deepseek_layer.py b/tests/test_moe/test_deepseek_layer.py
new file mode 100644
index 000000000000..d18ba2eacd84
--- /dev/null
+++ b/tests/test_moe/test_deepseek_layer.py
@@ -0,0 +1,78 @@
+from copy import deepcopy
+
+import pytest
+import torch
+import torch.distributed as dist
+from torch.testing import assert_close
+from transformers import AutoConfig, AutoModel
+
+import colossalai
+from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
+from colossalai.shardformer.modeling.deepseek import EPDeepseekMoE
+from colossalai.testing.utils import spawn
+
+tokens, n_experts = 7, 4
+hidden_size = 8
+top_k = 2
+
+
+def check_deepseek_moe_layer():
+    torch.cuda.set_device(dist.get_rank())
+    plugin = MoeHybridParallelPlugin(
+        precision="bf16",
+        tp_size=1,
+        pp_size=1,
+        zero_stage=1,
+        ep_size=dist.get_world_size(),
+    )
+
+    config = AutoConfig.from_pretrained(
+        "deepseek-ai/deepseek-moe-16b-base",
+        num_hidden_layers=1,
+        n_routed_experts=n_experts,
+        num_experts_per_tok=top_k,
+        hidden_size=hidden_size,
+        intermediate_size=hidden_size * 2,
+        first_k_dense_replace=0,
+        num_attention_heads=2,
+        trust_remote_code=True,
+    )
+    torch.manual_seed(0)
+    # get the moe layer in auto model
+    orig_model = AutoModel.from_config(config, trust_remote_code=True).layers[0].mlp.cuda()
+    x = torch.rand(1, tokens, hidden_size, requires_grad=True).cuda()
+    orig_output = orig_model(x)
+    model = deepcopy(orig_model)
+    model = EPDeepseekMoE.from_native_module(
+        model,
+        ep_group=plugin.ep_group,
+        moe_dp_group=plugin.moe_dp_group,
+        tp_group=plugin.tp_group,
+    )
+    ep_output = model(x)
+    assert_close(orig_output, ep_output)
+    orig_loss = orig_output.mean()
+    orig_loss.backward()
+    ep_loss = ep_output.mean()
+    ep_loss.backward()
+    assert_close(orig_loss, ep_loss)
+    name_to_p = {n: p for n, p in orig_model.named_parameters()}
+    for n, ep_p in model.named_parameters():
+        p = name_to_p[n]
+        if ep_p.grad is not None:
+            assert_close(p.grad, ep_p.grad)
+
+
+def run_dist(rank: int, world_size: int, port: int):
+    colossalai.launch(rank, world_size, "localhost", port)
+    check_deepseek_moe_layer()
+
+
+@pytest.mark.skip("tested in corresponding sharderformer")
+@pytest.mark.parametrize("world_size", [2])
+def test_deepseek_moe_layer(world_size: int):
+    spawn(run_dist, world_size)
+
+
+if __name__ == "__main__":
+    test_deepseek_moe_layer(2)
diff --git a/tests/test_moe/test_kernel.py b/tests/test_moe/test_kernel.py
index 28e6db441411..c81023988377 100644
--- a/tests/test_moe/test_kernel.py
+++ b/tests/test_moe/test_kernel.py
@@ -4,8 +4,6 @@
 import torch
 
 from colossalai.accelerator import get_accelerator
-
-# from colossalai.moe import SparseMLP
 from colossalai.moe._operation import MoeCombine, MoeDispatch, moe_cumsum
 
 NUM_EXPERTS = 4
diff --git a/tests/test_moe/test_mixtral_layer.py b/tests/test_moe/test_mixtral_layer.py
index b7b0322e08b5..bc41ac4f33e9 100644
--- a/tests/test_moe/test_mixtral_layer.py
+++ b/tests/test_moe/test_mixtral_layer.py
@@ -23,6 +23,7 @@ def check_mixtral_moe_layer():
         precision="bf16",
         tp_size=1,
         pp_size=1,
+        zero_stage=1,
         ep_size=dist.get_world_size(),
     )
     config = MixtralConfig(
@@ -36,7 +37,12 @@ def check_mixtral_moe_layer():
     x = torch.rand(1, tokens, hidden_size, requires_grad=True).cuda()
     orig_output, orig_logits = orig_model(x)
     model = deepcopy(orig_model)
-    model = EPMixtralSparseMoeBlock.from_native_module(model, ep_group=plugin.ep_group)
+    model = EPMixtralSparseMoeBlock.from_native_module(
+        model,
+        ep_group=plugin.ep_group,
+        tp_group=plugin.tp_group,
+        moe_dp_group=plugin.moe_dp_group,
+    )
     ep_output, ep_logits = model(x)
     assert_close(orig_logits, ep_logits)
     assert_close(orig_output, ep_output)
@@ -57,7 +63,8 @@ def run_dist(rank: int, world_size: int, port: int):
     check_mixtral_moe_layer()
 
 
-@pytest.mark.parametrize("world_size", [2, 4])
+@pytest.mark.skip("tested in corresponding sharderformer")
+@pytest.mark.parametrize("world_size", [2])
 def test_mixtral_moe_layer(world_size: int):
     spawn(run_dist, world_size)
 
diff --git a/tests/test_moe/test_moe_checkpoint.py b/tests/test_moe/test_moe_checkpoint.py
index 249dd4b971c5..89f5d1c64d0d 100644
--- a/tests/test_moe/test_moe_checkpoint.py
+++ b/tests/test_moe/test_moe_checkpoint.py
@@ -6,30 +6,23 @@
 import pytest
 import torch
 import torch.distributed as dist
-from torch.optim import Adam
+from torch.optim import SGD, Adam
 from transformers.models.mixtral.configuration_mixtral import MixtralConfig
 from transformers.models.mixtral.modeling_mixtral import MixtralForCausalLM
 
 import colossalai
 from colossalai.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
-from colossalai.checkpoint_io import MoECheckpointIO
-from colossalai.tensor.moe_tensor.api import is_moe_tensor
+from colossalai.testing import parameterize, spawn
+from colossalai.testing.random import seed_all
 from colossalai.testing.utils import spawn
+from tests.test_moe.moe_utils import check_model_equal
 
 tokens, n_experts = 7, 4
 hidden_size = 8
 top_k = 2
 
 
-def check_model_equal(model1, model2):
-    assert set(model1.state_dict().keys()) == set(model2.state_dict().keys())
-    for i, ((name, p1), p2) in enumerate(zip(model1.named_parameters(), model2.parameters())):
-        if not torch.equal(p1.half(), p2.half()):
-            print(f"Model parameter {name} is not equal. is_moe_tensor: {is_moe_tensor(p1)}")
-            raise AssertionError(f"Model parameter {name} is not equal")
-
-
 def get_optimizer_snapshot(optim):
     state = {id(k): deepcopy(v) for k, v in optim.state.items()}
     param_groups = []
@@ -77,36 +70,44 @@ def check_optimizer_snapshot_equal(snapshot1, snapshot2, param2name, moe_dp_grou
         raise AssertionError(f"A total of {count} optim states are not equal")
 
 
-def check_mixtral_moe_layer():
+@parameterize(
+    "test_config",
+    [
+        [
+            MixtralConfig(
+                hidden_size=hidden_size,
+                intermediate_size=hidden_size * 2,
+                num_local_experts=n_experts,
+                num_experts_per_tok=top_k,
+                num_attention_heads=2,
+                num_key_value_heads=2,
+                num_hidden_layers=2,
+            ),
+            MixtralForCausalLM,
+        ],
+    ],
+)
+def check_moe_checkpoint(test_config):
+    dtype, precision = torch.float16, "fp16"
+    config, model_cls = test_config
+    torch.cuda.set_device(dist.get_rank())
+
     context = tempfile.TemporaryDirectory() if dist.get_rank() == 0 else nullcontext()
     with context as f:
-        torch.cuda.set_device(dist.get_rank())
         if dist.get_rank() == 0:
             broadcast_objects = [f]  # any picklable object
         else:
             broadcast_objects = [None]
         dist.broadcast_object_list(broadcast_objects, src=0)
 
-        config = MixtralConfig(
-            hidden_size=hidden_size,
-            intermediate_size=hidden_size * 2,
-            num_local_experts=n_experts,
-            num_experts_per_tok=top_k,
-            num_attention_heads=2,
-            num_key_value_heads=2,
-        )
-        torch.manual_seed(0)
         input_ids = torch.randint(0, 100, (2, tokens)).cuda()
-        orig_model = MixtralForCausalLM(config).cuda()
+        orig_model = model_cls(config).cuda().to(dtype)
+
+        seed_all(10086)
         model = deepcopy(orig_model)
-        optimizer = Adam(model.parameters(), lr=1e-3)
+        optimizer = SGD(model.parameters(), lr=1e-3)
         plugin = MoeHybridParallelPlugin(
-            pp_size=2,
-            ep_size=2,
-            tp_size=1,
-            checkpoint_io=MoECheckpointIO,
-            microbatch_size=1,
-            zero_stage=1,
+            pp_size=2, ep_size=2, tp_size=1, microbatch_size=1, zero_stage=1, precision=precision
         )
         booster = Booster(plugin=plugin)
         model, optimizer, *_ = booster.boost(model=model, optimizer=optimizer)
@@ -120,7 +121,6 @@ def check_mixtral_moe_layer():
             lambda outputs, inputs: outputs.loss,
             optimizer,
         )
-
         tmpdirname = broadcast_objects[0]
         model_dir = os.path.join(tmpdirname, "mixtral_model")
         hf_model_dir = os.path.join(tmpdirname, "mixtral_hf_model")
@@ -129,13 +129,12 @@ def check_mixtral_moe_layer():
         booster.save_model(model, model_dir, shard=True)
         dist.barrier()
         if dist.get_rank() == 0:
-            saved_model = MixtralForCausalLM.from_pretrained(model_dir).cuda()
+            saved_model = model_cls.from_pretrained(model_dir).cuda().to(dtype)
             check_model_equal(orig_model, saved_model)
-            # check_model_equal(model, saved_model)
             saved_model.save_pretrained(hf_model_dir)
         dist.barrier()
         # check load model
-        new_model = MixtralForCausalLM(config).cuda()
+        new_model = model_cls(config).cuda().to(dtype)
         new_optimizer = Adam(new_model.parameters(), lr=1e-3)
         new_model, new_optimizer, *_ = booster.boost(model=new_model, optimizer=new_optimizer)
         booster.load_model(new_model, hf_model_dir)
@@ -163,7 +162,7 @@ def check_mixtral_moe_layer():
 
 def run_dist(rank: int, world_size: int, port: int):
     colossalai.launch(rank, world_size, "localhost", port)
-    check_mixtral_moe_layer()
+    check_moe_checkpoint()
 
 
 # Test EP + ZeRO + PP
diff --git a/tests/test_moe/test_moe_ep_tp.py b/tests/test_moe/test_moe_ep_tp.py
index 9bc11033af6f..e6d2609ee67c 100644
--- a/tests/test_moe/test_moe_ep_tp.py
+++ b/tests/test_moe/test_moe_ep_tp.py
@@ -1,238 +1,132 @@
-import os
-import warnings
-from typing import Dict
+from copy import deepcopy
 
 import pytest
 import torch
 import torch.distributed as dist
+from transformers.models.mixtral.configuration_mixtral import MixtralConfig
+from transformers.models.mixtral.modeling_mixtral import MixtralModel
 
 import colossalai
-from colossalai.accelerator import get_accelerator
-from colossalai.moe.manager import MOE_MANAGER
-from colossalai.moe.utils import sync_moe_model_param
-
-# from colossalai.shardformer.layer import SparseMLP
-from colossalai.tensor.moe_tensor.api import get_ep_group, get_ep_rank, get_ep_size, is_moe_tensor
-from colossalai.testing import assert_equal_in_group, rerun_if_address_is_in_use, spawn
-from tests.test_moe.moe_utils import MoeGradientHandler
-
-
-def sync_tp_from_local(tp_model, local_model, assert_grad_flag: bool = False) -> None:
-    """Sync the parameters of tp model from local model
-
-    Args:
-        tp_model (MoeModule)
-        local_model (MoeModule)
-    """
-    for (tp_name, tp_param), (local_name, local_param) in zip(
-        tp_model.named_parameters(), local_model.named_parameters()
-    ):
-        assert tp_name == local_name
-        if not is_moe_tensor(tp_param):
-            if assert_grad_flag:
-                assert torch.allclose(tp_param, local_param)
-                assert torch.allclose(tp_param.grad, local_param.grad)
-            else:
-                tp_param.data.copy_(local_param.data)
-            continue
-
-        tp_rank = get_ep_rank(tp_param)
-        tp_dim = [i for i, (d1, d2) in enumerate(zip(tp_param.shape, local_param.shape)) if d1 != d2][0]
-        tp_slice = [slice(None)] * tp_dim + [
-            slice(tp_param.shape[tp_dim] * tp_rank, tp_param.shape[tp_dim] * (tp_rank + 1))
-        ]
-
-        if assert_grad_flag:
-            assert torch.allclose(tp_param, local_param[tuple(tp_slice)])
-            assert torch.allclose(tp_param.grad, local_param.grad[tuple(tp_slice)])
-        else:
-            tp_param.data.copy_(local_param[tuple(tp_slice)].data)
-
-
-def sync_tp_from_ep(tp_model, ep_model, assert_grad_flag: bool = False) -> None:
-    """Sync the parameters of tp model from ep model
-
-    Args:
-        tp_model (MoeModule)
-        ep_model (MoeModule)
-    """
-    for (tp_name, tp_param), (ep_name, ep_param) in zip(tp_model.named_parameters(), ep_model.named_parameters()):
-        assert tp_name == ep_name
-        if not is_moe_tensor(tp_param):
-            if assert_grad_flag:
-                assert torch.allclose(tp_param, ep_param)
-                assert torch.allclose(tp_param.grad, ep_param.grad)
-            else:
-                tp_param.data.copy_(ep_param.data)
-            continue
-
-        # gather param from ep model
-        param_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
-        dist.all_gather(param_list, ep_param, group=get_ep_group(ep_param))
-        all_param = torch.cat(param_list, dim=0)
-        if assert_grad_flag:
-            grad_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
-            dist.all_gather(grad_list, ep_param.grad, group=get_ep_group(ep_param))
-            all_grad = torch.cat(grad_list, dim=0)
-
-        # get tp param
-        tp_dim = [i for i, (d1, d2) in enumerate(zip(tp_param.shape[1:], all_param.shape[1:])) if d1 != d2][0] + 1
-        tp_rank = get_ep_rank(tp_param)
-        tp_slice = [slice(None)] * tp_dim + [
-            slice(tp_param.shape[tp_dim] * tp_rank, tp_param.shape[tp_dim] * (tp_rank + 1))
-        ]
-        new_tp_param = all_param[tuple(tp_slice)]
-        if assert_grad_flag:
-            new_grad = all_grad[tuple(tp_slice)]
-        if assert_grad_flag:
-            assert torch.allclose(tp_param, new_tp_param)
-            assert torch.allclose(tp_param.grad, new_grad)
-        else:
-            tp_param.data.copy_(new_tp_param.data)
-
-
-def sync_local_from_ep(local_model, ep_model, assert_grad_flag: bool = False) -> None:
-    """Sync the parameters of tp model from ep model
-
-    Args:
-        local_model (MoeModule)
-        ep_model (MoeModule)
-    """
-    for (local_name, local_param), (ep_name, ep_param) in zip(
-        local_model.named_parameters(), ep_model.named_parameters()
-    ):
-        assert local_name == ep_name
-        if "experts" not in local_name:
-            if assert_grad_flag:
-                assert torch.allclose(local_param, ep_param)
-                assert torch.allclose(local_param.grad, ep_param.grad)
-            else:
-                local_param.data.copy_(ep_param.data)
-            continue
-
-        # gather param from ep model
-        param_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
-        dist.all_gather(param_list, ep_param, group=get_ep_group(ep_param))
-        all_param = torch.cat(param_list, dim=0)
-        if assert_grad_flag:
-            grad_list = [torch.zeros_like(ep_param) for _ in range(get_ep_size(ep_param))]
-            dist.all_gather(grad_list, ep_param.grad, group=get_ep_group(ep_param))
-            all_grad = torch.cat(grad_list, dim=0)
-
-        if assert_grad_flag:
-            assert torch.allclose(local_param, all_param)
-            assert torch.allclose(local_param.grad, all_grad)
-        else:
-            local_param.data.copy_(all_param.data)
-
-
-def run_test(rank: int, world_size: int, port: int, num_experts: int, batch_size: int, dim: int, config: Dict):
-    assert batch_size % world_size == 0
-
-    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
-
-    MOE_MANAGER.__init__()
-    MOE_MANAGER.setup(parallel=None)
-    local_model = SparseMLP(num_experts=num_experts, hidden_size=dim, intermediate_size=dim * 2)
-    MOE_MANAGER.__init__()
-    MOE_MANAGER.setup(parallel="EP")
-    enable_hierarchical_comm = config.get("enable_hierarchical_comm", False)
-    if enable_hierarchical_comm:
-        os.environ["LOCAL_WORLD_SIZE"] = str(world_size)
-    ep_model = SparseMLP(
-        num_experts=num_experts,
-        hidden_size=dim,
-        intermediate_size=dim * 2,
-        enable_hierarchical_comm=enable_hierarchical_comm,
+from colossalai.booster.booster import Booster
+from colossalai.booster.plugin import HybridParallelPlugin
+from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing.random import seed_all
+from tests.test_moe.moe_utils import assert_loose_close
+
+NUM_BATCH = 4
+NUM_TOK_PER_BATCH, NUM_EXPERTS = 7, 4
+HIDDEN_SIZE_PER_HEAD = 4
+NUM_HEADS = 4
+TOP_K = 2
+
+
+@parameterize("stage", [1])
+@parameterize("ep_size", [2])
+def run_zero_with_original_model(stage: int, ep_size: int):
+    tp_size = dist.get_world_size() // ep_size
+    dtype = torch.bfloat16
+
+    rank = torch.distributed.get_rank()
+    torch.cuda.set_device(dist.get_rank())
+
+    seed_all(10086)
+
+    config = MixtralConfig(
+        hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
+        intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
+        num_hidden_layers=2,
+        num_attention_heads=NUM_HEADS,
+        num_key_value_heads=NUM_HEADS,
+        num_local_experts=NUM_EXPERTS,
+        num_experts_per_tok=TOP_K,
     )
-    MOE_MANAGER.__init__()
-    MOE_MANAGER.setup(parallel="TP")
-    tp_model = SparseMLP(num_experts=num_experts, hidden_size=dim, intermediate_size=dim * 2)
-    ep_model = ep_model.to(get_accelerator().get_current_device())
-    tp_model = tp_model.to(get_accelerator().get_current_device())
-    local_model = local_model.to(get_accelerator().get_current_device())
-
-    # sync ep param
-    sync_moe_model_param(ep_model)
-    dist_dict = MOE_MANAGER.parallel_info_dict
-    assert_equal_in_group(ep_model.experts.wi.data, dist_dict[world_size].dp_group)
-    assert_equal_in_group(ep_model.experts.wo.data, dist_dict[world_size].dp_group)
-    ep_grad_handler = MoeGradientHandler(ep_model)
-    # sync local param
-    sync_local_from_ep(local_model, ep_model)
-    # sync tp param
-    sync_tp_from_ep(tp_model, ep_model)
-    tp_grad_handler = MoeGradientHandler(tp_model)
-
-    rank = dist.get_rank()
-    input_data = torch.randn(batch_size, dim, device=get_accelerator().get_current_device())
-    micro_batch_size = batch_size // world_size
-    index = rank * micro_batch_size
-    # NOTE: ep & tp takes in sharded data for each process
-    shard_data = input_data.detach()[index : index + micro_batch_size]
-
-    out_local = local_model(input_data)
-    MOE_MANAGER.reset_loss()
-    out_tp = tp_model(shard_data)
-    MOE_MANAGER.reset_loss()
-    out_ep = ep_model(shard_data)
-    MOE_MANAGER.reset_loss()
-
-    assert torch.allclose(
-        out_tp, out_ep, atol=1e-6
-    ), f"Rank {rank} failed, max diff: {torch.max(torch.abs(out_tp - out_ep))}"
-    try:
-        out_local_slice = out_local[index : index + micro_batch_size]
-        assert torch.allclose(
-            out_ep, out_local_slice, atol=1e-6
-        ), f"Rank {rank} failed, max diff: {torch.max(torch.abs(out_ep - out_local_slice))}"
-    except AssertionError:
-        """
-        e.g., in local model, tokens = 4, capacity = 2, experts = 2, topk = 1
-            router yields [01] --> [0], [23] --> [1], this is valid as capacity is 2
-            However, in ep mode, there are 2 separate routers dealing with sharded data.
-            Assume router 0 handles token [01] and router 1 handles token [23].
-            Note that for each router the capacity is only 1 !!!
-            Thus, router 0 may yields [0] --> [0] or [1] --> [0], but not both.
-            The same thing happens on router 1. And finally some tokens are dropped due to the sharded nature.
-        """
-        warnings.warn(
-            "EP & TP may result in different behavior from local model. " "Please check the comments for details."
+    torch_model = MixtralModel(config).to(dtype).cuda()
+
+    zero_model = deepcopy(torch_model).to(dtype)
+    zero_optimizer = torch.optim.SGD(zero_model.parameters(), lr=1)
+    moe_booster = Booster(
+        plugin=MoeHybridParallelPlugin(
+            tp_size=tp_size,
+            moe_tp_size=tp_size,
+            pp_size=1,
+            ep_size=ep_size,
+            zero_stage=stage,
+            overlap_communication=False,
+            initial_scale=1,
         )
-
-    out_local.mean().backward()
-    out_tp.mean().backward()
-    tp_grad_handler.handle_gradient()
-    out_ep.mean().backward()
-    ep_grad_handler.handle_gradient()
-
-    assert_equal_in_group(ep_model.experts.wi.grad, dist_dict[world_size].dp_group)
-    assert_equal_in_group(ep_model.experts.wo.grad, dist_dict[world_size].dp_group)
-    sync_tp_from_ep(tp_model, ep_model, assert_grad_flag=True)
-    try:
-        sync_local_from_ep(local_model, ep_model, assert_grad_flag=True)
-    except AssertionError:
-        warnings.warn(
-            "EP & TP may result in different behavior from local model. " "Please check the comments for details."
+    )
+    zero_model, zero_optimizer, _, _, _ = moe_booster.boost(zero_model, zero_optimizer)
+
+    hybird_booster = Booster(
+        plugin=HybridParallelPlugin(
+            tp_size=tp_size,
+            pp_size=1,
+            zero_stage=stage,
+            overlap_communication=False,
+            initial_scale=1,
         )
+    )
+    hybrid_model, hybrid_optimizer, _, _, _ = hybird_booster.boost(
+        torch_model, torch.optim.SGD(torch_model.parameters(), lr=1)
+    )
+    # create different input
+    seed_all(1453 + rank)
+
+    hybrid_model.train()
+    zero_model.train()
+    for _ in range(2):
+        # zero-dp forward
+        input_data = torch.rand(
+            NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS, requires_grad=True
+        ).cuda()
+        zero_output = zero_model(inputs_embeds=input_data.to(dtype)).last_hidden_state.mean()
+        # zero-dp backward
+        zero_optimizer.backward(zero_output)
+        # torch-ddp forward
+        hybrid_output = hybrid_model(inputs_embeds=input_data.to(dtype)).last_hidden_state.mean()
+        assert_loose_close(zero_output, hybrid_output, dtype=dtype)
+        # torch-ddp backward
+        hybrid_optimizer.backward(hybrid_output)
+
+        # check grad
+        name_to_p = {n: p for n, p in hybrid_model.named_parameters()}
+        for n, p in zero_model.named_parameters():
+            zero_grad = zero_optimizer.get_param_grad(p)
+            if name_to_p[n].grad is None:
+                name_to_p[n].grad = torch.zeros_like(name_to_p[n])
+                continue
+            if zero_grad.shape != name_to_p[n].grad.shape:  # TODO check sharded and sliced moe
+                continue
+            assert_loose_close(zero_grad, name_to_p[n].grad, dtype=dtype, name=n)
+
+        # zero-dp step
+        zero_optimizer.step()
+
+        # original model step
+        hybrid_optimizer.step()
+
+        # check updated param
+        for n, p in zero_model.named_parameters():
+            if p.data.shape != name_to_p[n].data.shape:  # TODO check sharded and sliced moe
+                continue
+            assert_loose_close(p.data, name_to_p[n].data, dtype=dtype, name=n)
+
+    print(f"{dist.get_rank()} test passed")
+
+
+def run_dist(rank, world_size, port):
+    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_zero_with_original_model()
 
 
-@pytest.mark.skip(reason="moe need to be refactored")
+@pytest.mark.skip("tested in corresponding sharderformer")
 @pytest.mark.dist
-@pytest.mark.parametrize("num_experts", [4, 64])
-@pytest.mark.parametrize("batch_size", [16])
-@pytest.mark.parametrize("dim", [64])
-@pytest.mark.parametrize(
-    "config",
-    [
-        {"enable_hierarchical_comm": False},
-        {"enable_hierarchical_comm": True},
-    ],
-)
+@pytest.mark.parametrize("world_size", [4])
 @rerun_if_address_is_in_use()
-def test_moe_ep_tp(num_experts: int, batch_size: int, dim: int, config: Dict):
-    spawn(run_test, 2, num_experts=num_experts, batch_size=batch_size, dim=dim, config=config)
+def test_moe_ep_tp(world_size):
+    spawn(run_dist, world_size)
 
 
 if __name__ == "__main__":
-    test_moe_ep_tp(num_experts=8, batch_size=32, dim=32)
+    test_moe_ep_tp(world_size=4)
diff --git a/tests/test_moe/test_moe_ep_zero.py b/tests/test_moe/test_moe_ep_zero.py
new file mode 100644
index 000000000000..2d4e638b638a
--- /dev/null
+++ b/tests/test_moe/test_moe_ep_zero.py
@@ -0,0 +1,119 @@
+from copy import deepcopy
+
+import pytest
+import torch
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+from transformers.models.mixtral.configuration_mixtral import MixtralConfig
+from transformers.models.mixtral.modeling_mixtral import MixtralModel
+
+import colossalai
+from colossalai.booster.booster import Booster
+from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing.random import seed_all
+from tests.test_moe.moe_utils import assert_loose_close
+
+NUM_BATCH = 4
+NUM_TOK_PER_BATCH, NUM_EXPERTS = 7, 4
+HIDDEN_SIZE_PER_HEAD = 4
+NUM_HEADS = 2
+TOP_K = 1
+
+
+@parameterize("stage", [1])
+@parameterize("ep_size", [2, 4])
+def run_zero_with_original_model(stage: int, ep_size: int):
+    dtype = torch.bfloat16
+
+    rank = torch.distributed.get_rank()
+    torch.cuda.set_device(dist.get_rank())
+
+    plugin = MoeHybridParallelPlugin(
+        pp_size=1, tp_size=1, ep_size=ep_size, zero_stage=stage, overlap_communication=False, initial_scale=1
+    )
+    booster = Booster(plugin=plugin)
+
+    seed_all(10086)
+
+    config = MixtralConfig(
+        hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
+        intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
+        num_hidden_layers=2,
+        num_attention_heads=NUM_HEADS,
+        num_key_value_heads=NUM_HEADS,
+        num_local_experts=NUM_EXPERTS,
+        num_experts_per_tok=TOP_K,
+    )
+
+    torch_model = MixtralModel(config).to(dtype).cuda()
+
+    zero_model = deepcopy(torch_model).to(dtype)
+    zero_optimizer = torch.optim.SGD(zero_model.parameters(), lr=1)
+
+    zero_model, zero_optimizer, _, _, _ = booster.boost(zero_model, zero_optimizer)
+
+    ddp_model = DDP(
+        torch_model.cuda(),
+        process_group=plugin.dp_group,
+        find_unused_parameters=True,  # important for torch ddp, not all experts are routed
+    ).cuda()
+    ddp_optimizer = torch.optim.SGD(ddp_model.parameters(), lr=1)
+
+    # create different input
+    seed_all(1453 + rank)
+
+    ddp_model.train()
+    zero_model.train()
+    for _ in range(2):
+        # zero-dp forward
+        input_data = torch.rand(
+            NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS, requires_grad=True
+        ).cuda()
+        zero_output = zero_model(inputs_embeds=input_data.to(dtype)).last_hidden_state.mean()
+        # zero-dp backward
+        zero_optimizer.backward(zero_output)
+
+        # torch-ddp forward
+        ddp_output = ddp_model(inputs_embeds=input_data.to(dtype)).last_hidden_state.mean()
+        assert_loose_close(zero_output, ddp_output, dtype=dtype)
+        # torch-ddp backward
+        ddp_output.backward()
+
+        # check grad
+        name_to_p = {n: p for n, p in ddp_model.named_parameters()}
+        for n, p in zero_model.named_parameters():
+            zero_grad = zero_optimizer.get_param_grad(p)
+            if name_to_p[n].grad is None:
+                name_to_p[n].grad = torch.zeros_like(name_to_p[n].data)
+                continue
+            assert_loose_close(zero_grad, name_to_p[n].grad, dtype=dtype, name=n)
+
+        # zero-dp step
+        zero_optimizer.step()
+
+        # original model step
+        ddp_optimizer.step()
+
+        # check updated param
+        for n, p in zero_model.named_parameters():
+            assert_loose_close(p.data, name_to_p[n].data, dtype=dtype, name=n)
+
+    print(f"{dist.get_rank()} test passed")
+
+
+def run_dist(rank, world_size, port):
+    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_zero_with_original_model()
+
+
+@pytest.mark.skip("tested in corresponding sharderformer")
+@pytest.mark.dist
+@pytest.mark.parametrize("world_size", [4])
+@rerun_if_address_is_in_use()
+def test_moe_ep_zero(world_size):
+    spawn(run_dist, world_size)
+
+
+if __name__ == "__main__":
+    test_moe_ep_zero(world_size=4)
diff --git a/tests/test_moe/test_moe_zero_fwd_bwd_optim.py b/tests/test_moe/test_moe_zero_fwd_bwd_optim.py
deleted file mode 100644
index 042b3d8aedc5..000000000000
--- a/tests/test_moe/test_moe_zero_fwd_bwd_optim.py
+++ /dev/null
@@ -1,132 +0,0 @@
-from copy import deepcopy
-
-import pytest
-import torch
-import torch.distributed as dist
-from torch.nn.parallel import DistributedDataParallel as DDP
-from transformers.models.mixtral.configuration_mixtral import MixtralConfig
-from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
-
-import colossalai
-from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
-from colossalai.shardformer.modeling.mixtral import EPMixtralSparseMoeBlock
-from colossalai.tensor.moe_tensor.api import is_moe_tensor
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from colossalai.testing.random import seed_all
-from colossalai.zero import LowLevelZeroOptimizer
-from tests.test_moe.moe_utils import loose_close
-
-tokens, n_experts = 7, 4
-hidden_size = 8
-top_k = 2
-
-
-def split_grad(grad, world_size):
-    with torch.no_grad():
-        grad = grad.clone().detach().flatten()
-        padding_size = (world_size - grad.numel() % world_size) % world_size
-        if padding_size > 0:
-            grad = torch.nn.functional.pad(grad, [0, padding_size])
-        splited_grad = grad.split(grad.numel() // world_size)
-    return splited_grad
-
-
-@parameterize("dtype", [torch.float16, torch.bfloat16])
-@parameterize("master_weights", [True, False])
-@parameterize("stage", [1, 2])
-def run_zero_with_original_model(world_size, master_weights: bool, dtype: torch.dtype, stage: int):
-    rank = torch.distributed.get_rank()
-    torch.cuda.set_device(dist.get_rank())
-    plugin = MoeHybridParallelPlugin(
-        tp_size=1,
-        pp_size=1,
-        ep_size=dist.get_world_size() // 2,
-    )
-
-    seed_all(10086)
-    config = MixtralConfig(
-        hidden_size=hidden_size,
-        intermediate_size=hidden_size * 2,
-        num_local_experts=n_experts,
-        num_experts_per_tok=top_k,
-    )
-
-    orig_model = MixtralSparseMoeBlock(config).to(dtype).cuda()
-
-    ori_model = DDP(orig_model.cuda(), static_graph=True).cuda()
-
-    zero_model = deepcopy(orig_model).to(dtype)
-    zero_model = EPMixtralSparseMoeBlock.from_native_module(zero_model, ep_group=plugin.ep_group)
-
-    zero_optimizer = torch.optim.SGD(zero_model.parameters(), lr=1)
-    pg_param_list = {plugin.global_dp_group: [], plugin.moe_dp_group: []}
-    for p in zero_model.parameters():
-        if is_moe_tensor(p):
-            pg_param_list[plugin.moe_dp_group].append(p)
-        else:
-            pg_param_list[plugin.global_dp_group].append(p)
-
-    zero_optimizer = LowLevelZeroOptimizer(
-        zero_optimizer,
-        pg_to_param_list=pg_param_list,
-        master_weights=master_weights,
-        initial_scale=1,
-        overlap_communication=False,
-        partition_grad=True,
-    )
-
-    ori_optimizer = torch.optim.SGD(ori_model.parameters(), lr=1)
-
-    # create
-    seed_all(1453 + rank)
-
-    for _ in range(2):
-        # zero-dp forward
-        input_data = torch.rand(1, tokens, hidden_size).cuda()
-        zero_output, zero_logits = zero_model(input_data.to(dtype))
-
-        # torch-ddp forward
-        ori_output, ori_logits = ori_model(input_data.to(dtype))
-        loose_close(zero_output, ori_output, dtype=dtype)
-
-        # zero-dp backward
-        zero_optimizer.backward(zero_output.mean().float())
-
-        # torch-ddp backward
-        ori_output.mean().backward()
-
-        # check grad
-        name_to_p = {n: p for n, p in ori_model.module.named_parameters()}
-        for n, p in zero_model.named_parameters():
-            zero_grad = zero_optimizer.get_param_grad(p)
-            if name_to_p[n].grad is None:
-                assert zero_grad is None
-                continue
-
-            loose_close(zero_grad, name_to_p[n].grad, dtype=dtype)
-
-        # zero-dp step
-        zero_optimizer.step()
-
-        # original model step
-        ori_optimizer.step()
-
-        # check updated param
-        for n, p in zero_model.named_parameters():
-            loose_close(p.data, name_to_p[n].data, dtype=dtype)
-
-
-def run_dist(rank, world_size, port):
-    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
-    run_zero_with_original_model(world_size=world_size)
-
-
-@pytest.mark.dist
-@pytest.mark.parametrize("world_size", [2, 4])
-@rerun_if_address_is_in_use()
-def test_moe_zero_model(world_size):
-    spawn(run_dist, world_size)
-
-
-if __name__ == "__main__":
-    test_moe_zero_model(world_size=4)
diff --git a/tests/test_shardformer/test_model/_utils.py b/tests/test_shardformer/test_model/_utils.py
index 1ffcc541a854..190fee12931b 100644
--- a/tests/test_shardformer/test_model/_utils.py
+++ b/tests/test_shardformer/test_model/_utils.py
@@ -1,6 +1,6 @@
 import copy
 from contextlib import nullcontext
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Type
 
 import torch
 import torch.distributed as dist
@@ -117,7 +117,12 @@ def check_state_dict(org_model: Module, sharded_model: Module, name: str = ""):
 
 
 def build_model_from_hybrid_plugin(
-    model_fn: Callable, loss_fn: Callable, test_config: Dict[str, Any], optim_class=Adam, sharded_optim_class=Adam
+    model_fn: Callable,
+    loss_fn: Callable,
+    test_config: Dict[str, Any],
+    optim_class=Adam,
+    sharded_optim_class=Adam,
+    pluggin_cls: Type[HybridParallelPlugin] = HybridParallelPlugin,
 ):
     use_lazy_init = False
     if "use_lazy_init" in test_config:
@@ -149,9 +154,10 @@ def build_model_from_hybrid_plugin(
     else:
         org_optimizer = optim_class(org_model.parameters(), lr=1e-3)
         sharded_optimizer = sharded_optim_class(sharded_model.parameters(), lr=1e-3)
+
     criterion = loss_fn
 
-    plugin = HybridParallelPlugin(**test_config)
+    plugin = pluggin_cls(**test_config)
     booster = Booster(plugin=plugin)
 
     sharded_model, sharded_optimizer, criterion, _, _ = booster.boost(sharded_model, sharded_optimizer, criterion)
diff --git a/tests/test_shardformer/test_model/test_shard_chatglm2.py b/tests/test_shardformer/test_model/test_shard_chatglm2.py
index 6ce020b68ab5..92c077950ecc 100644
--- a/tests/test_shardformer/test_model/test_shard_chatglm2.py
+++ b/tests/test_shardformer/test_model/test_shard_chatglm2.py
@@ -136,6 +136,44 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @parameterize(
     "test_config",
     [
+        {  # Ulysess + Flash attention
+            "tp_size": 1,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "all_to_all",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 2,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "split_gather",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 1,
+            "pp_size": 1,
+            "sp_size": 2,
+            "num_microbatches": 1,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "all_to_all",
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
         {
             "tp_size": 4,
             "pp_size": 1,
diff --git a/tests/test_shardformer/test_model/test_shard_command.py b/tests/test_shardformer/test_model/test_shard_command.py
index 4d66692a4c11..3281b50e1d5d 100644
--- a/tests/test_shardformer/test_model/test_shard_command.py
+++ b/tests/test_shardformer/test_model/test_shard_command.py
@@ -58,6 +58,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     # Check the grad when using ZeRO-1 and ZeRO-2
     if (
         booster.plugin.zero_stage in [1, 2]
+        and booster.plugin.shard_config.pipeline_stage_manager is None
         and booster.plugin.shard_config.enable_sequence_parallelism
         and booster.plugin.shard_config.sequence_parallelism_mode == "all_to_all"
     ):
@@ -154,6 +155,45 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @parameterize(
     "test_config",
     [
+        {  # Ulysess + Flash attention
+            "tp_size": 1,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "all_to_all",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 2,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "split_gather",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 2,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "ring",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
         {
             "tp_size": 2,
             "pp_size": 1,
diff --git a/tests/test_shardformer/test_model/test_shard_deepseek.py b/tests/test_shardformer/test_model/test_shard_deepseek.py
new file mode 100644
index 000000000000..46da4522fd9d
--- /dev/null
+++ b/tests/test_shardformer/test_model/test_shard_deepseek.py
@@ -0,0 +1,196 @@
+import os
+import shutil
+from copy import deepcopy
+from typing import Tuple
+
+import pytest
+import torch
+import torch.distributed
+import torch.distributed as dist
+from transformers import AutoConfig, AutoModel
+
+import colossalai
+from colossalai.booster.booster import Booster
+from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing.random import seed_all
+from tests.test_moe.moe_utils import assert_loose_close, check_model_equal
+
+NUM_BATCH = 8
+NUM_TOK_PER_BATCH, NUM_EXPERTS = 4, 2
+NUM_LAYERS = 4
+HIDDEN_SIZE_PER_HEAD = 4
+NUM_HEADS = 4
+TOP_K = 2
+
+
+CHECKED_CONFIG = [  # FOR_WORLD=4
+    (1, 4, 1, 1, 1),
+    (1, 1, 4, 1, 1),
+    (1, 1, 1, 4, 1),
+    (1, 1, 1, 1, 4),
+    (0, 1, 4, 1, 1),
+    (0, 1, 1, 4, 1),
+    (0, 1, 1, 1, 4),
+    (1, 2, 1, 1, 1),
+]
+
+
+@parameterize(
+    "config",
+    [
+        (1, 2, 2, 1, 1),
+        (1, 2, 1, 2, 1),
+        (1, 2, 1, 1, 2),
+    ],
+)
+def run_zero_with_original_model(config: Tuple[int, ...]):
+    stage, ep_size, pp_size, tp_size, sp_size = config
+    world_size = dist.get_world_size()
+    rank = dist.get_rank()
+    dtype, precision = torch.float16, "fp16"
+    torch.cuda.set_device(dist.get_rank())
+
+    plugin = MoeHybridParallelPlugin(
+        pp_size=pp_size,
+        num_microbatches=pp_size,
+        tp_size=tp_size,
+        sp_size=sp_size,
+        ep_size=ep_size,
+        zero_stage=stage,
+        enable_sequence_parallelism=sp_size > 1,
+        sequence_parallelism_mode="all_to_all" if sp_size > 1 else None,
+        enable_flash_attention=sp_size > 1,
+        overlap_communication=False,
+        initial_scale=1,
+        precision=precision,
+        find_unused_parameters=True,
+    )
+    dp_size = plugin.dp_size
+
+    booster = Booster(plugin=plugin)
+
+    assert pp_size <= NUM_LAYERS, "pp_size should be less than or equal to NUM_LAYERS"
+    config = AutoConfig.from_pretrained(
+        "deepseek-ai/deepseek-moe-16b-base",
+        hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
+        intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
+        moe_intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
+        num_hidden_layers=4,
+        num_attention_heads=NUM_HEADS,
+        num_key_value_heads=NUM_HEADS,
+        first_k_dense_replace=1,
+        attn_implementation="flash_attention_2",
+        torch_dtype="float16",
+        n_routed_experts=NUM_EXPERTS,
+        num_experts_per_tok=TOP_K,
+        trust_remote_code=True,
+    )
+
+    # init model with the same seed
+    seed_all(10086)
+
+    torch_model = AutoModel.from_config(config, trust_remote_code=True).cuda().to(dtype)
+    torch_optimizer = torch.optim.SGD(torch_model.parameters(), lr=1)
+
+    parallel_model = deepcopy(torch_model)
+    parallel_optimizer = torch.optim.SGD(parallel_model.parameters(), lr=1)
+    parallel_model, parallel_optimizer, _, _, _ = booster.boost(parallel_model, parallel_optimizer)
+
+    # create different input along dp axis
+    seed_all(1453 + rank)
+
+    torch_model.train()
+    parallel_model.train()
+    for _ in range(2):
+        # gen random input
+        input_embeddings = torch.rand(
+            NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS, requires_grad=True
+        ).cuda()
+        dist.all_reduce(
+            input_embeddings, group=plugin.pp_group
+        )  # pp inputs except the first stage doesn't matter, but need to be replicate for torch model check
+
+        dist.all_reduce(input_embeddings, group=plugin.tp_group)  # tp group duplicate input
+        dist.all_reduce(input_embeddings, group=plugin.sp_group)  # sp group duplicate input
+
+        # run the model with hybrid parallel
+        if booster.plugin.stage_manager is not None:
+            # for test with pp
+            data_iter = iter([{"inputs_embeds": input_embeddings}])
+            sharded_output = booster.execute_pipeline(
+                data_iter,
+                parallel_model,
+                lambda x, y: x[0].mean(),
+                parallel_optimizer,
+                return_loss=True,
+                return_outputs=True,
+            )
+            if booster.plugin.stage_manager.is_last_stage():
+                parallel_output = sharded_output["loss"]
+            else:
+                parallel_output = torch.tensor(12345.0, device="cuda")
+
+            # broadcast along pp axis
+            dist.broadcast(
+                parallel_output, src=dist.get_process_group_ranks(plugin.pp_group)[-1], group=plugin.pp_group
+            )
+        else:
+            # for test without pp
+            parallel_output = parallel_model(inputs_embeds=input_embeddings.to(dtype)).last_hidden_state.mean()
+            parallel_optimizer.backward(parallel_output)
+        parallel_optimizer.step()
+        parallel_optimizer.zero_grad()
+        dist.all_reduce(parallel_output, group=plugin.dp_group)
+
+        # ===================================================================================
+        # run normal model with all dp(different) inputs
+        all_inputs = [torch.empty_like(input_embeddings) for _ in range(dp_size)]
+        dist.all_gather(all_inputs, input_embeddings, group=plugin.dp_group)
+        torch_output_sum = 0
+        for input_data_ in all_inputs:
+            torch_output = torch_model(inputs_embeds=input_data_.to(dtype)).last_hidden_state.mean()
+            torch_output.backward()
+            torch_output_sum += torch_output.detach()
+        # avg dp grads follows zero optimizer
+        for p in torch_model.parameters():
+            if p.grad is not None:
+                p.grad /= dp_size
+        torch_optimizer.step()
+        torch_optimizer.zero_grad()
+
+        assert_loose_close(parallel_output, torch_output_sum, dtype=dtype)
+
+    # use checkpoint to load sharded zero model
+    model_dir = "./test_deepseek"
+    if rank == world_size - 1:
+        os.makedirs(model_dir, exist_ok=True)
+
+    dist.barrier()
+    booster.save_model(parallel_model, model_dir, shard=True)
+    dist.barrier()
+
+    saved_model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).cuda()
+    check_model_equal(torch_model, saved_model)
+    dist.barrier()
+
+    if rank == world_size - 1:
+        shutil.rmtree(model_dir)
+
+    print(f"rank {dist.get_rank()} test passed")
+
+
+def run_dist(rank, world_size, port):
+    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_zero_with_original_model()
+
+
+@pytest.mark.dist
+@pytest.mark.parametrize("world_size", [4])
+@rerun_if_address_is_in_use()
+def test_deepseek(world_size):
+    spawn(run_dist, world_size)
+
+
+if __name__ == "__main__":
+    test_deepseek(world_size=4)
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index 8fe18f69bcd1..88e54176b9fd 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -59,10 +59,12 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     if (
         booster.plugin.zero_stage in [1, 2]
         and booster.plugin.shard_config.enable_sequence_parallelism
+        and booster.plugin.shard_config.pipeline_stage_manager is None
         and booster.plugin.shard_config.sequence_parallelism_mode == "all_to_all"
     ):
+        master2working = sharded_optimizer.get_master_to_working_map()
         for p1, p2 in zip(llama_model.parameters(), sharded_optimizer._master_param_groups_of_current_rank[0]):
-            working_p = sharded_optimizer.master_to_working_param[id(p2)]
+            working_p = master2working[id(p2)]
             grads = sharded_optimizer.get_partitioned_gradients_by_param_id(0, id(working_p))
             grad_index = (
                 0
@@ -146,29 +148,29 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @parameterize(
     "test_config",
     [
-        {  # Test ring + Flash attention
-            "tp_size": 2,
-            "pp_size": 1,
+        {  # Ulysess + Flash attention
+            "tp_size": 1,
+            "pp_size": 2,
             "sp_size": 2,
-            "num_microbatches": 1,
+            "num_microbatches": 2,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "ring",
+            "sequence_parallelism_mode": "all_to_all",
             "enable_flash_attention": True,
             "use_lazy_init": True,
-            "zero_stage": 2,
+            "zero_stage": 0,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {  # Ulysess + Flash attention
-            "tp_size": 1,
-            "pp_size": 2,
+        {  # Test ring + Flash attention
+            "tp_size": 2,
+            "pp_size": 1,
             "sp_size": 2,
-            "num_microbatches": 2,
+            "num_microbatches": 1,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
+            "sequence_parallelism_mode": "ring",
             "enable_flash_attention": True,
             "use_lazy_init": True,
-            "zero_stage": 1,
+            "zero_stage": 2,
             "precision": "fp16",
             "initial_scale": 1,
         },
@@ -245,7 +247,6 @@ def run_llama_test(test_config):
         except Exception as e:
             print(f"Failed config: {test_config}")
             raise e
-
     clear_layout_converter()
     Randomizer.reset_index()
     torch.cuda.empty_cache()
diff --git a/tests/test_shardformer/test_model/test_shard_mixtral.py b/tests/test_shardformer/test_model/test_shard_mixtral.py
new file mode 100644
index 000000000000..de09eedcbed5
--- /dev/null
+++ b/tests/test_shardformer/test_model/test_shard_mixtral.py
@@ -0,0 +1,190 @@
+import os
+import shutil
+from copy import deepcopy
+from typing import Tuple
+
+import pytest
+import torch
+import torch.distributed
+import torch.distributed as dist
+from transformers.models.mixtral.configuration_mixtral import MixtralConfig
+from transformers.models.mixtral.modeling_mixtral import MixtralModel
+
+import colossalai
+from colossalai.booster.booster import Booster
+from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing.random import seed_all
+from tests.test_moe.moe_utils import assert_loose_close, check_model_equal
+
+NUM_BATCH = 8
+NUM_TOK_PER_BATCH, NUM_EXPERTS = 4, 4
+NUM_LAYERS = 4
+HIDDEN_SIZE_PER_HEAD = 4
+NUM_HEADS = 4
+TOP_K = 1
+
+CHECKED_CONFIG = [  # FOR WORLD=4
+    (0, 1, 4, 1, 1),
+    (0, 1, 1, 4, 1),
+    (0, 1, 1, 1, 4),
+    (1, 4, 1, 1, 1),
+    (1, 1, 4, 1, 1),
+    (1, 1, 1, 4, 1),
+    (1, 1, 1, 1, 4),
+    (1, 2, 1, 1, 1),
+]
+
+
+@parameterize(
+    "config",
+    [
+        (1, 2, 2, 1, 1),
+        (1, 2, 1, 2, 1),
+        (1, 2, 1, 1, 2),
+    ],
+)
+def run_zero_with_original_model(config: Tuple[int, ...]):
+    stage, ep_size, pp_size, tp_size, sp_size = config
+    world_size = dist.get_world_size()
+    rank = dist.get_rank()
+    dtype, precision = torch.float16, "fp16"
+    torch.cuda.set_device(dist.get_rank())
+
+    plugin = MoeHybridParallelPlugin(
+        pp_size=pp_size,
+        num_microbatches=pp_size,
+        tp_size=tp_size,
+        sp_size=sp_size,
+        ep_size=ep_size,
+        zero_stage=stage,
+        enable_sequence_parallelism=sp_size > 1,
+        sequence_parallelism_mode="all_to_all" if sp_size > 1 else None,
+        overlap_communication=False,
+        initial_scale=1,
+        precision=precision,
+        find_unused_parameters=True,
+    )
+    dp_size = plugin.dp_size
+
+    booster = Booster(plugin=plugin)
+
+    assert pp_size <= NUM_LAYERS, "pp_size should be less than or equal to NUM_LAYERS"
+    config = MixtralConfig(
+        hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
+        intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
+        num_hidden_layers=NUM_LAYERS,
+        num_attention_heads=NUM_HEADS,
+        num_key_value_heads=NUM_HEADS,
+        num_local_experts=NUM_EXPERTS,
+        num_experts_per_tok=TOP_K,
+        attn_implementation="flash_attention_2",
+    )
+
+    # init model with the same seed
+    seed_all(10086)
+
+    torch_model = MixtralModel(config).to(dtype).cuda()
+    torch_optimizer = torch.optim.SGD(torch_model.parameters(), lr=1)
+
+    parallel_model = deepcopy(torch_model)
+    parallel_optimizer = torch.optim.SGD(parallel_model.parameters(), lr=1)
+    parallel_model, parallel_optimizer, _, _, _ = booster.boost(parallel_model, parallel_optimizer)
+
+    # create different input along dp axis
+    seed_all(1453 + rank)
+
+    torch_model.train()
+    parallel_model.train()
+    for _ in range(2):
+        # gen random input
+        input_embeddings = torch.rand(
+            NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS, requires_grad=True
+        ).cuda()
+        dist.all_reduce(
+            input_embeddings, group=plugin.pp_group
+        )  # pp inputs except the first stage doesn't matter, but need to be replicate for torch model check
+
+        dist.all_reduce(input_embeddings, group=plugin.tp_group)  # tp group duplicate input
+        dist.all_reduce(input_embeddings, group=plugin.sp_group)  # sp group duplicate input
+
+        # run the model with hybrid parallel
+        if booster.plugin.stage_manager is not None:
+            # for test with pp
+            data_iter = iter([{"inputs_embeds": input_embeddings}])
+            sharded_output = booster.execute_pipeline(
+                data_iter,
+                parallel_model,
+                lambda x, y: x.last_hidden_state.mean(),
+                parallel_optimizer,
+                return_loss=True,
+                return_outputs=True,
+            )
+            if booster.plugin.stage_manager.is_last_stage():
+                parallel_output = sharded_output["loss"]
+            else:
+                parallel_output = torch.tensor(12345.0, device="cuda")
+
+            # broadcast along pp axis
+            dist.broadcast(
+                parallel_output, src=dist.get_process_group_ranks(plugin.pp_group)[-1], group=plugin.pp_group
+            )
+        else:
+            # for test without pp
+            parallel_output = parallel_model(inputs_embeds=input_embeddings.to(dtype)).last_hidden_state.mean()
+            parallel_optimizer.backward(parallel_output)
+        parallel_optimizer.step()
+        parallel_optimizer.zero_grad()
+        dist.all_reduce(parallel_output, group=plugin.dp_group)
+
+        # ===================================================================================
+        # run normal model with all dp(different) inputs
+        all_inputs = [torch.empty_like(input_embeddings) for _ in range(dp_size)]
+        dist.all_gather(all_inputs, input_embeddings, group=plugin.dp_group)
+        torch_output_sum = 0
+        for input_data_ in all_inputs:
+            torch_output = torch_model(inputs_embeds=input_data_.to(dtype)).last_hidden_state.mean()
+            torch_output.backward()
+            torch_output_sum += torch_output.detach()
+        # avg dp grads follows zero optimizer
+        for p in torch_model.parameters():
+            if p.grad is not None:
+                p.grad /= dp_size
+        torch_optimizer.step()
+        torch_optimizer.zero_grad()
+
+        assert_loose_close(parallel_output, torch_output_sum, dtype=dtype)
+
+    # use checkpoint to load sharded zero model
+    model_dir = "./test_mixtral"
+    if rank == world_size - 1:
+        os.makedirs(model_dir, exist_ok=True)
+
+    dist.barrier()
+    booster.save_model(parallel_model, model_dir, shard=True)
+    dist.barrier()
+
+    saved_model = MixtralModel.from_pretrained(model_dir).cuda().to(dtype)
+    check_model_equal(torch_model, saved_model)
+    dist.barrier()
+
+    if rank == world_size - 1:
+        shutil.rmtree(model_dir)
+
+    print(f"rank {dist.get_rank()} test passed")
+
+
+def run_dist(rank, world_size, port):
+    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_zero_with_original_model()
+
+
+@pytest.mark.dist
+@pytest.mark.parametrize("world_size", [4])
+@rerun_if_address_is_in_use()
+def test_mixtral(world_size):
+    spawn(run_dist, world_size)
+
+
+if __name__ == "__main__":
+    test_mixtral(world_size=4)
diff --git a/tests/test_shardformer/test_model/test_shard_qwen2.py b/tests/test_shardformer/test_model/test_shard_qwen2.py
index 166b31df967e..c87415b7562d 100644
--- a/tests/test_shardformer/test_model/test_shard_qwen2.py
+++ b/tests/test_shardformer/test_model/test_shard_qwen2.py
@@ -135,6 +135,68 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
+        {  # Ulysess + Flash attention
+            "tp_size": 1,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "all_to_all",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 2,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "split_gather",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 2,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "ring",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 1,
+            "pp_size": 1,
+            "sp_size": 2,
+            "num_microbatches": 1,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "all_to_all",
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 4,
+            "pp_size": 1,
+            "num_microbatches": 1,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "split_gather",
+            "enable_flash_attention": False,
+            "use_lazy_init": True,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
         {
             "tp_size": 1,
             "pp_size": 2,
@@ -151,8 +213,11 @@ def run_qwen2_test(test_config):
     sub_model_zoo = model_zoo.get_sub_registry("transformers_qwen2")
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
-        check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
-
+        try:
+            check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        except Exception as e:
+            print(f"Failed config: {test_config}")
+            raise e
     clear_layout_converter()
     Randomizer.reset_index()
     torch.cuda.empty_cache()
@@ -197,7 +262,11 @@ def run_qwen2_3d_test(test_config):
     sub_model_zoo = model_zoo.get_sub_registry("transformers_qwen2")
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
-        check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        try:
+            check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        except Exception as e:
+            print(f"Failed config: {test_config}")
+            raise e
 
     clear_layout_converter()
     Randomizer.reset_index()
diff --git a/tests/test_zero/test_low_level/test_grad_acc.py b/tests/test_zero/test_low_level/test_grad_acc.py
index ed12bb72dc3e..94db70ca50f7 100644
--- a/tests/test_zero/test_low_level/test_grad_acc.py
+++ b/tests/test_zero/test_low_level/test_grad_acc.py
@@ -64,8 +64,12 @@ def fwd_bwd_func(number, cur_data, check_flag):
     zero1_optimizer.step()
     zero2_optimizer.step()
 
+    zero1_optimizer._force_wait_all_gather()
+    zero2_optimizer._force_wait_all_gather()
+
     # check updated param
     for z1p, z2p in zip(zero1_model.parameters(), zero2_model.parameters()):
+        assert not hasattr(z1p, "_all_gather_handle")
         assert torch.equal(z1p.data, z2p.data)
 
 
diff --git a/tests/test_zero/test_low_level/test_zero1_2.py b/tests/test_zero/test_low_level/test_zero1_2.py
index cda51c4ef435..368c782fe2c4 100644
--- a/tests/test_zero/test_low_level/test_zero1_2.py
+++ b/tests/test_zero/test_low_level/test_zero1_2.py
@@ -190,6 +190,8 @@ def exam_zero_1_torch_ddp(world_size, dtype: torch.dtype, master_weights: bool):
         # torch ddp step
         torch_optimizer.step()
 
+        zero_optimizer._force_wait_all_gather()
+
         # check updated param
         for (n, p), z1p in zip(torch_model.named_parameters(), zero_model.parameters()):
             loose_close(p, z1p, dtype=dtype)
diff --git a/version.txt b/version.txt
index 1d0ba9ea182b..2b7c5ae01848 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.4.0
+0.4.2

From afb26de873bd5ac9238b810ea29cd5416c9d9c03 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Tue, 6 Aug 2024 16:58:23 +0800
Subject: [PATCH 017/167] [fp8]support all2all fp8 (#5953)

* support all2all fp8

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 colossalai/quantization/fp8.py           | 56 ++++++++++++++++++++
 tests/test_fp8/test_all_to_all_single.py | 67 ++++++++++++++++++++++++
 2 files changed, 123 insertions(+)
 create mode 100644 tests/test_fp8/test_all_to_all_single.py

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index b003e90e89f8..0fb7c4b5db50 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -115,6 +115,62 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3", op=ReduceOp.SUM, gro
     tensor.copy_(out[:input_size].view(input_shape).to(input_type))
 
 
+def all_to_all_single_fp8(
+    output, input, output_split_sizes=None, input_split_sizes=None, fp8_format="e5m2", group=None, async_op=False
+) -> None:
+    r"""
+    This is an in-place operation for compressed all_reduce using fp8.
+    It works like dist.all_to_all_single but during communication the data is cast to fp8 format.
+    Args:
+        tensor: torch.Tensor in fp32, fp16, bf16 datatype.
+        fp8_format: e4m3 or e5m2
+    Returns:
+        None
+    """
+    world_size = dist.get_world_size(group=group)
+    input_type = input.dtype
+    input_shape = input.shape
+    input_device = input.device
+    input = input.flatten()
+
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+
+    ret, scale = cast_to_fp8(input, fp8_format=fp8_format)
+
+    inp = ret.view(torch.uint8)
+    if input_split_sizes is not None:
+        input_split_sizes = [input_split_sizes[i] * np.prod(input_shape[1:]) for i in range(world_size)]
+        input_chunks = list(torch.split(inp, input_split_sizes))
+    else:
+        input_chunks = list(torch.chunk(inp, world_size, dim=0))
+
+    if output_split_sizes is not None:
+        output_chunks = [
+            torch.empty((output_split_sizes[i] * np.prod(input_shape[1:]),), device=input_device, dtype=inp.dtype)
+            for i in range(world_size)
+        ]
+    else:
+        if dist.get_rank() == world_size - 1:
+            output_chunks = [torch.empty_like(input_chunks[-1]) for _ in range(world_size)]
+        else:
+            output_chunks = [torch.empty_like(input_chunks[0]) for _ in range(world_size)]
+
+    dist.all_to_all(output_chunks, input_chunks, group=group)
+    scale_list = [torch.ones(1, dtype=scale.dtype, device=input_device) for _ in range(world_size)]
+    dist.all_gather(scale_list, scale, group=group)
+    cast_output_chunk = [
+        cast_from_fp8(out.view(fp8_type), scale, input_type) for scale, out in zip(scale_list, output_chunks)
+    ]
+
+    tensor_out = torch.cat(cast_output_chunk, dim=0)
+    outputs_shape = list(input_shape)
+    if output_split_sizes is not None:
+        outputs_shape[0] = sum(output_split_sizes)
+    else:
+        outputs_shape = input_shape
+    output.data = tensor_out.view(outputs_shape).to(input_type)
+
+
 def cast_to_fp8_pipeline(inp: Any) -> None:
     """
     Cast the hidden_states tensor of inp object to fp8 format before p2p communication in pipeline.
diff --git a/tests/test_fp8/test_all_to_all_single.py b/tests/test_fp8/test_all_to_all_single.py
new file mode 100644
index 000000000000..88becd3f07fc
--- /dev/null
+++ b/tests/test_fp8/test_all_to_all_single.py
@@ -0,0 +1,67 @@
+import torch
+import torch.distributed as dist
+from torch.distributed.distributed_c10d import _get_default_group
+from torch.testing import assert_close
+
+from colossalai import launch
+from colossalai.accelerator import get_accelerator
+from colossalai.quantization.fp8 import all_to_all_single_fp8
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+
+
+@parameterize("shape", [(4,), (1, 8, 16), (4, 8, 16)])
+@parameterize("dtype", [torch.bfloat16])
+def check_all2all(shape, dtype):
+    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
+    output = torch.empty_like(x)
+    output_fp8 = torch.empty_like(x)
+    dist.all_to_all_single(output, x, group=_get_default_group(), async_op=False)
+    all_to_all_single_fp8(output_fp8, x, group=_get_default_group(), async_op=False)
+    assert_close(output, output_fp8, rtol=0.1, atol=0.1)
+
+
+@parameterize("shape", [(8, 8, 16)])
+@parameterize("dtype", [torch.bfloat16, torch.float16])
+def check_all2all_uneven(shape, dtype):
+    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
+    input_split_sizes = [3, 3, 1, 1]
+    if dist.get_rank() in [0, 1]:
+        output_split_sizes = [3, 3, 3, 3]
+    else:
+        output_split_sizes = [1, 1, 1, 1]
+    output_shape = list(shape)
+    output_shape[0] = sum(output_split_sizes)
+    output = torch.empty(output_shape, device=x.device, dtype=x.dtype)
+    output_fp8 = torch.empty(output_shape, device=x.device, dtype=x.dtype)
+    dist.all_to_all_single(
+        output,
+        x,
+        output_split_sizes=output_split_sizes,
+        input_split_sizes=input_split_sizes,
+        group=_get_default_group(),
+        async_op=False,
+    )
+    all_to_all_single_fp8(
+        output_fp8,
+        x,
+        output_split_sizes=output_split_sizes,
+        input_split_sizes=input_split_sizes,
+        group=_get_default_group(),
+        async_op=False,
+    )
+    assert_close(output, output_fp8, rtol=0.1, atol=0.1)
+
+
+def run_dist(rank, world_size, port):
+    launch(rank=rank, world_size=world_size, port=port, host="localhost")
+    check_all2all()
+    check_all2all_uneven()
+
+
+@rerun_if_address_is_in_use()
+def test_all_to_all_single():
+    spawn(run_dist, 4)
+
+
+if __name__ == "__main__":
+    test_all_to_all_single()

From 76ea16466fc24a5a8895ddc00d6df06a270c4f40 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 7 Aug 2024 15:41:49 +0800
Subject: [PATCH 018/167] [fp8] add fp8 linear (#5967)

* [fp8] add fp8 linear

* [test] fix fp8 linear test condition

* [test] fix fp8 linear test condition

* [test] fix fp8 linear test condition
---
 colossalai/quantization/fp8.py    | 61 ++++++++++++++++++++++++++++++-
 tests/test_fp8/test_fp8_linear.py | 45 +++++++++++++++++++++++
 2 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_fp8/test_fp8_linear.py

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 0fb7c4b5db50..bc8c3ced4cdd 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any, Optional
 
 import numpy as np
 import torch
@@ -415,3 +415,62 @@ def gather_fp8(output_list, input_, group=None, fp8_format="e5m2"):
         output = tensor_list[i].view(fp8_type)
         scale = scale_list[i]
         output_list[i].copy_(cast_from_fp8(output, scale, input_type))
+
+
+class _LinearFp8(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx: Any,
+        x: torch.Tensor,
+        w: torch.Tensor,
+        bias: Optional[torch.Tensor],
+    ) -> Any:
+        assert (
+            x.dtype in (torch.bfloat16, torch.float16) and x.dtype == w.dtype
+        ), "Only float16 and bfloat16 are allowed."
+        if bias is not None:
+            assert bias.dtype == x.dtype, "Bias should have the same dtype as input."
+        # ensure x and w are row-major
+        assert x.is_contiguous() and w.is_contiguous(), "Input and weight should be contiguous."
+        ctx.x_shape = x.shape
+        ctx.has_bias = bias is not None
+        ctx.out_dtype = x.dtype
+        x = x.reshape(-1, x.shape[-1])
+
+        x_fp8, inv_scale_x = cast_to_fp8(x, fp8_format="e4m3")
+        w_fp8, inv_scale_w = cast_to_fp8(w, fp8_format="e4m3")
+        ctx.x_fp8 = x_fp8
+        ctx.w_fp8_t = w_fp8.t()
+        ctx.inv_scale_x = inv_scale_x
+        ctx.inv_scale_w = inv_scale_w
+        out = torch._scaled_mm(
+            x_fp8, ctx.w_fp8_t, bias=bias, out_dtype=ctx.out_dtype, scale_a=inv_scale_x, scale_b=inv_scale_w
+        )[0]
+        return out.reshape(*ctx.x_shape[:-1], w.shape[0])
+
+    @staticmethod
+    def backward(ctx: Any, out_grad) -> Any:
+        out_grad = out_grad.reshape(-1, out_grad.shape[-1])
+        out_grad_fp8, out_grad_scale = cast_to_fp8(out_grad, fp8_format="e5m2")
+        x_grad = torch._scaled_mm(
+            out_grad_fp8,
+            ctx.w_fp8_t.contiguous().t(),
+            out_dtype=ctx.out_dtype,
+            scale_a=out_grad_scale,
+            scale_b=ctx.inv_scale_w,
+        )[0]
+        w_grad = torch._scaled_mm(
+            out_grad_fp8.t().contiguous(),
+            ctx.x_fp8.t().contiguous().t(),
+            out_dtype=ctx.out_dtype,
+            scale_a=out_grad_scale,
+            scale_b=ctx.inv_scale_x,
+        )[0]
+        bias_grad = None
+        if ctx.has_bias:
+            bias_grad = out_grad.sum(0)
+        return x_grad.reshape(ctx.x_shape), w_grad, bias_grad
+
+
+def linear_fp8(x: torch.Tensor, w: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    return _LinearFp8.apply(x, w, bias)
diff --git a/tests/test_fp8/test_fp8_linear.py b/tests/test_fp8/test_fp8_linear.py
new file mode 100644
index 000000000000..d035957f2a31
--- /dev/null
+++ b/tests/test_fp8/test_fp8_linear.py
@@ -0,0 +1,45 @@
+import pytest
+import torch
+import torch.nn.functional as F
+from torch.testing import assert_close
+
+from colossalai.accelerator import get_accelerator
+from colossalai.quantization.fp8 import linear_fp8
+from colossalai.utils import get_current_device
+
+D_IN, D_OUT = 16, 32
+B, S = 2, 64
+DTYPE = torch.bfloat16
+
+
+@pytest.mark.skipif(get_accelerator().get_device_capability()[0] < 9, reason="Test requires device capability >= 9.0")
+@pytest.mark.parametrize("use_bias", [True, False])
+@pytest.mark.parametrize("use_batch", [True, False])
+def test_fp8_linear(use_bias: bool, use_batch: bool):
+    # create tensors
+    w = torch.rand(D_OUT, D_IN, device=get_current_device(), dtype=DTYPE, requires_grad=True)
+    ref_w = w.clone().detach().requires_grad_()
+    if use_batch:
+        x_shape = (B, S, D_IN)
+    else:
+        x_shape = (S, D_IN)
+    x = torch.rand(x_shape, device=get_current_device(), dtype=DTYPE, requires_grad=True)
+    ref_x = x.clone().detach().requires_grad_()
+    if use_bias:
+        bias = torch.rand(D_OUT, device=get_current_device(), dtype=DTYPE, requires_grad=True)
+        ref_bias = bias.clone().detach().requires_grad_()
+    else:
+        bias = None
+        ref_bias = None
+
+    out = linear_fp8(x, w, bias)
+    assert out.shape == x_shape[:-1] + (D_OUT,)
+    out.sum().backward()
+    ref_out = F.linear(ref_x, ref_w, ref_bias)
+    ref_out.sum().backward()
+
+    assert_close(out, ref_out, rtol=0.2, atol=0.1)
+    assert_close(x.grad, ref_x.grad, rtol=0.2, atol=0.1)
+    assert_close(w.grad, ref_w.grad, rtol=0.2, atol=0.1)
+    if use_bias:
+        assert_close(bias.grad, ref_bias.grad, rtol=0.2, atol=0.1)

From ccabcf64858f98d580d9ff9e704f3bf6de57effc Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 7 Aug 2024 18:21:08 +0800
Subject: [PATCH 019/167] [fp8] support fp8 amp for hybrid parallel plugin
 (#5975)

* [fp8] support fp8 amp for hybrid parallel plugin

* [test] add fp8 hook test

* [fp8] fix fp8 linear compatibility
---
 colossalai/booster/plugin/fp8_hook.py         | 23 +++++++++
 .../booster/plugin/hybrid_parallel_plugin.py  | 18 ++++++-
 colossalai/quantization/fp8.py                |  3 +-
 colossalai/tensor/colo_parameter.py           |  2 +
 colossalai/tensor/param_op_hook.py            |  9 ++++
 tests/test_fp8/test_fp8_hook.py               | 50 +++++++++++++++++++
 6 files changed, 102 insertions(+), 3 deletions(-)
 create mode 100644 colossalai/booster/plugin/fp8_hook.py
 create mode 100644 tests/test_fp8/test_fp8_hook.py

diff --git a/colossalai/booster/plugin/fp8_hook.py b/colossalai/booster/plugin/fp8_hook.py
new file mode 100644
index 000000000000..6171dd755a9d
--- /dev/null
+++ b/colossalai/booster/plugin/fp8_hook.py
@@ -0,0 +1,23 @@
+import torch.nn.functional as F
+
+from colossalai.quantization.fp8 import linear_fp8
+from colossalai.tensor.param_op_hook import ColoParamOpHook
+
+
+class FP8Hook(ColoParamOpHook):
+    def pre_forward(self, params) -> None:
+        pass
+
+    def post_forward(self, params) -> None:
+        pass
+
+    def pre_backward(self, params) -> None:
+        pass
+
+    def post_backward(self, params) -> None:
+        pass
+
+    def rewrite_op(self, func):
+        if func is F.linear:
+            return linear_fp8
+        return func
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index bf0788650811..f585abdb5d04 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -40,6 +40,7 @@
 from colossalai.zero.low_level import LowLevelZeroOptimizer
 from colossalai.zero.low_level.zero_hook import ZeroOpHook, wait_all_gather_handle
 
+from .fp8_hook import FP8Hook
 from .pp_plugin_base import PipelinePluginBase
 
 SUPPORT_SP_MODE = ["split_gather", "ring", "all_to_all"]
@@ -66,6 +67,7 @@ def __init__(
         ddp_config: dict,
         custom_policy: Policy,
         overlap_allgather: bool = False,
+        use_fp8: bool = False,
     ) -> None:
         self.stage_manager = shard_config.pipeline_stage_manager
         self.shard_config = shard_config
@@ -75,6 +77,7 @@ def __init__(
         self.use_dpp = use_ddp
         self.require_grad_sync = True
         self.overlap_allgather = overlap_allgather
+        self.use_fp8 = use_fp8
 
         shardformer = ShardFormer(shard_config)
         if custom_policy is not None:
@@ -112,8 +115,12 @@ def __init__(
             module = DDP(module, process_group=dp_group, **ddp_config)
 
         super().__init__(module)
+        self.op_hooks = []
         if overlap_allgather:
-            self.op_hook = ZeroOpHook()
+            self.op_hooks.append(ZeroOpHook())
+        if use_fp8:
+            self.op_hooks.append(FP8Hook())
+        if overlap_allgather or use_fp8:
             for p in module.parameters():
                 if p.requires_grad and type(p) is not ColoParameter:
                     p.__class__ = ColoParameter
@@ -223,7 +230,11 @@ def _force_wait_all_gather(self):
             wait_all_gather_handle(p)
 
     def _wait_all_gather(self):
-        return ColoParamOpHookManager.use_hooks(self.op_hook) if self.overlap_allgather else nullcontext()
+        return (
+            ColoParamOpHookManager.use_hooks(*self.op_hooks)
+            if (self.overlap_allgather or self.use_fp8)
+            else nullcontext()
+        )
 
 
 def get_param_info(optim: Optimizer):
@@ -1019,6 +1030,7 @@ def __init__(
         overlap_p2p: bool = True,
         overlap_allgather: bool = False,
         fp8_communication: bool = False,
+        use_fp8: bool = False,
     ) -> None:
         super().__init__()
 
@@ -1063,6 +1075,7 @@ def __init__(
         self.enable_flash_attention = enable_flash_attention
         self.enable_jit_fused = enable_jit_fused
         self.enable_sequence_parallelism = enable_sequence_parallelism
+        self.use_fp8 = use_fp8
         if dp_outside:
             self.dp_axis, self.pp_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3
             self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size, self.sp_size)
@@ -1243,6 +1256,7 @@ def configure(
                 ddp_config=self.ddp_config,
                 custom_policy=self.custom_policy,
                 overlap_allgather=(self.zero_stage > 0 and self.zero_config["overlap_allgather"]),
+                use_fp8=self.use_fp8,
             )
         if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
             if zero_stage == 0:
diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index bc8c3ced4cdd..6d777e8a459d 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -431,7 +431,8 @@ def forward(
         if bias is not None:
             assert bias.dtype == x.dtype, "Bias should have the same dtype as input."
         # ensure x and w are row-major
-        assert x.is_contiguous() and w.is_contiguous(), "Input and weight should be contiguous."
+        x = x.contiguous()
+        w = w.contiguous()
         ctx.x_shape = x.shape
         ctx.has_bias = bias is not None
         ctx.out_dtype = x.dtype
diff --git a/colossalai/tensor/colo_parameter.py b/colossalai/tensor/colo_parameter.py
index acb9fc4ae8fc..8992b89a3c39 100644
--- a/colossalai/tensor/colo_parameter.py
+++ b/colossalai/tensor/colo_parameter.py
@@ -61,6 +61,8 @@ def __torch_function__(cls, func, types, args=..., kwargs=None):
                 with torch._C.DisableTorchFunction():
                     new_args = ColoParamOpHookManager.pre_op(params, *args, *kwargs.values())
                 args, kwargs = replace_args(args, kwargs, new_args)
+                with torch._C.DisableTorchFunction():
+                    func = ColoParamOpHookManager.rewrite_op(func)
                 ret = super().__torch_function__(func, types, args, kwargs)
                 with torch._C.DisableTorchFunction():
                     ret = ColoParamOpHookManager.post_op(params, ret)
diff --git a/colossalai/tensor/param_op_hook.py b/colossalai/tensor/param_op_hook.py
index 40de43c43b05..c8dd5a0c8407 100644
--- a/colossalai/tensor/param_op_hook.py
+++ b/colossalai/tensor/param_op_hook.py
@@ -30,6 +30,9 @@ def pre_backward(self, params: List[torch.Tensor]) -> None:
     def post_backward(self, params: List[torch.Tensor]) -> None:
         pass
 
+    def rewrite_op(self, func) -> Any:
+        return func
+
 
 class ColoParamOpHookManager:
     """
@@ -101,6 +104,12 @@ def post_op(params: List[torch.Tensor], arg: Any) -> Any:
     def has_hook() -> bool:
         return len(ColoParamOpHookManager.hooks) > 0
 
+    @staticmethod
+    def rewrite_op(func) -> Any:
+        for hook in ColoParamOpHookManager.hooks:
+            func = hook.rewrite_op(func)
+        return func
+
 
 class PreFwdPostBwd(torch.autograd.Function):
     @staticmethod
diff --git a/tests/test_fp8/test_fp8_hook.py b/tests/test_fp8/test_fp8_hook.py
new file mode 100644
index 000000000000..6cc147be72b7
--- /dev/null
+++ b/tests/test_fp8/test_fp8_hook.py
@@ -0,0 +1,50 @@
+import pytest
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from colossalai.accelerator import get_accelerator
+from colossalai.booster.plugin.fp8_hook import FP8Hook
+from colossalai.quantization.fp8 import linear_fp8
+from colossalai.tensor.colo_parameter import ColoParameter
+from colossalai.tensor.param_op_hook import ColoParamOpHookManager
+from colossalai.utils import get_current_device
+
+REPLACED = False
+TRIGGERED = False
+
+
+def new_linear_fp8(x, w, bias=None):
+    global TRIGGERED
+    TRIGGERED = True
+    return linear_fp8(x, w, bias)
+
+
+class FP8TestHook(FP8Hook):
+    def rewrite_op(self, func):
+        func = super().rewrite_op(func)
+        if func is linear_fp8:
+            global REPLACED
+            REPLACED = True
+            return new_linear_fp8
+        return func
+
+
+D_IN, D_OUT = 16, 32
+B, S = 2, 64
+DTYPE = torch.bfloat16
+
+
+@pytest.mark.skipif(get_accelerator().get_device_capability()[0] < 9, reason="Test requires device capability >= 9.0")
+def test_fp8_hook():
+    # create tensors
+    w = nn.Parameter(torch.rand(D_OUT, D_IN, device=get_current_device(), dtype=DTYPE))
+    x = torch.rand(B, S, D_IN, device=get_current_device(), dtype=DTYPE, requires_grad=True)
+    w.__class__ = ColoParameter
+    w.__init__(w, requires_grad=True)
+    hook = FP8TestHook()
+    with ColoParamOpHookManager.use_hooks(hook):
+        o = F.linear(x, w)
+    assert o.shape == (B, S, D_OUT)
+    assert REPLACED
+    assert TRIGGERED

From 7739629b9d89ea978a68c5746186d920f592f9d4 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 7 Aug 2024 18:58:39 +0800
Subject: [PATCH 020/167] fix (#5976)

---
 colossalai/quantization/fp8.py | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 6d777e8a459d..52bb8cc9bc33 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -376,28 +376,6 @@ def all_to_all_fp8(output_list, input_list, group=None, fp8_format="e5m2"):
         output_list[i].copy_(cast_from_fp8(tensor, scale, input_type))
 
 
-def all_to_all_single_fp8(output_tensor, input_tensor, group=None, fp8_format="e5m2"):
-
-    world_size = dist.get_world_size(group)
-
-    per_slice_len = input_tensor.size(0) // world_size
-    input_type = input_tensor.dtype
-    ret, scale = cast_to_fp8(input_tensor, fp8_format=fp8_format)
-    fp8_type = ret.dtype
-    input_tensor = ret.view(torch.uint8)
-    tensor = torch.empty_like(input_tensor)
-    scale_list = [torch.empty_like(scale) for _ in range(world_size)]
-    dist.all_to_all_single(tensor, input_tensor, group=group)
-    dist.all_gather(scale_list, scale, group=group)
-    cast_tensor_list = []
-
-    for i in range(world_size):
-        output_part = tensor[per_slice_len * i : per_slice_len * (i + 1)].view(fp8_type)
-        output_part = cast_from_fp8(output_part, scale_list[i], input_type)
-        cast_tensor_list.append(output_part)
-    output_tensor.copy_(torch.concatenate(cast_tensor_list, dim=0))
-
-
 def gather_fp8(output_list, input_, group=None, fp8_format="e5m2"):
 
     world_size = dist.get_world_size(group)

From b480eec738b58bd40a6e4cc656c8439ffed0be98 Mon Sep 17 00:00:00 2001
From: Hanks <hangxu0304@gmail.com>
Date: Thu, 8 Aug 2024 15:55:01 +0800
Subject: [PATCH 021/167] [Feature]: support FP8 communication in DDP, FSDP,
 Gemini (#5928)

* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* implement communication hook for FSDP params all-gather

* added unit test for fp8 operators

* support fp8 communication in GeminiPlugin

* update training scripts to support fsdp and fp8 communication

* fixed some minor bugs observed in unit test

* add all_gather_into_tensor_flat_fp8

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add skip the test if torch < 2.2.0

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add skip the test if torch < 2.2.0

* add skip the test if torch < 2.2.0

* add fp8_comm flag

* rebase latest fp8 operators

* rebase latest fp8 operators

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 colossalai/booster/plugin/gemini_plugin.py    |   2 +
 colossalai/booster/plugin/torch_ddp_plugin.py |   7 +
 .../booster/plugin/torch_fsdp_plugin.py       |  15 ++
 colossalai/quantization/fp8.py                | 207 +++++++++++++++++-
 colossalai/quantization/utils.py              | 112 ++++++++++
 colossalai/zero/gemini/chunk/chunk.py         |  15 +-
 colossalai/zero/gemini/chunk/manager.py       |   4 +
 colossalai/zero/gemini/gemini_ddp.py          |   3 +
 examples/language/bert/finetune.py            |  17 +-
 .../gpt/hybridparallelism/finetune.py         |   2 +-
 examples/language/llama/benchmark.py          |  12 +-
 tests/test_fp8/test_fp8_cast.py               |  26 +++
 tests/test_fp8/test_fp8_ddp_comm_hook.py      |  87 ++++++++
 tests/test_fp8/test_fp8_fsdp_comm_hook.py     | 107 +++++++++
 14 files changed, 602 insertions(+), 14 deletions(-)
 create mode 100644 colossalai/quantization/utils.py
 create mode 100644 tests/test_fp8/test_fp8_cast.py
 create mode 100644 tests/test_fp8/test_fp8_ddp_comm_hook.py
 create mode 100644 tests/test_fp8/test_fp8_fsdp_comm_hook.py

diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index ad131fbe739a..5ab8f05adc85 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -364,6 +364,7 @@ def __init__(
         enable_sequence_overlap: bool = False,
         enable_async_reduce: bool = True,
         verbose: bool = False,
+        fp8_communication: bool = False,
     ) -> None:
         super().__init__()
         assert precision in SUPPORTED_PRECISION, f"precision {precision} is not supported"
@@ -395,6 +396,7 @@ def __init__(
             master_weights=master_weights,
             max_prefetch=max_prefetch,
             enable_async_reduce=enable_async_reduce,
+            fp8_communication=fp8_communication,
         )
         self.zero_optim_config = dict(
             gpu_margin_mem_ratio=gpu_margin_mem_ratio,
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
index 5116446a4295..34caa2f684ba 100644
--- a/colossalai/booster/plugin/torch_ddp_plugin.py
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -177,6 +177,7 @@ def __init__(
         check_reduction: bool = False,
         gradient_as_bucket_view: bool = False,
         static_graph: bool = False,
+        fp8_communication: bool = False,
     ) -> None:
         super().__init__()
         self.ddp_kwargs = dict(
@@ -187,6 +188,7 @@ def __init__(
             gradient_as_bucket_view=gradient_as_bucket_view,
             static_graph=static_graph,
         )
+        self.fp8_communication = fp8_communication
 
     def support_no_sync(self) -> bool:
         return True
@@ -226,6 +228,11 @@ def configure(
         if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
             optimizer = OptimizerWrapper(optimizer)
 
+        if self.fp8_communication:
+            from colossalai.quantization.fp8 import fp8_compress_ddp_grad_comm_hook_async
+
+            model.module.register_comm_hook(None, fp8_compress_ddp_grad_comm_hook_async)
+
         return model, optimizer, criterion, dataloader, lr_scheduler
 
     def control_checkpoint_io(self) -> bool:
diff --git a/colossalai/booster/plugin/torch_fsdp_plugin.py b/colossalai/booster/plugin/torch_fsdp_plugin.py
index cd2f9e84018a..e3f81928de5e 100644
--- a/colossalai/booster/plugin/torch_fsdp_plugin.py
+++ b/colossalai/booster/plugin/torch_fsdp_plugin.py
@@ -298,6 +298,7 @@ def __init__(
             ignored_modules: Optional[Iterable[torch.nn.Module]] = None,
             param_init_fn: Optional[Callable[[nn.Module], None]] = None,
             sync_module_states: bool = False,
+            fp8_communication: bool = False,
         ):
             super().__init__()
             self.fsdp_kwargs = dict(
@@ -311,6 +312,7 @@ def __init__(
                 param_init_fn=param_init_fn,
                 sync_module_states=sync_module_states,
             )
+            self.fp8_communication = fp8_communication
 
     else:
         raise RuntimeError("FSDP is not supported while torch version under 1.12.0.")
@@ -347,6 +349,19 @@ def configure(
         # wrap the model with PyTorch FSDP
         fsdp_model = TorchFSDPModel(model, device_id=torch.cuda.current_device(), **self.fsdp_kwargs)
 
+        if self.fp8_communication:
+            from colossalai.quantization.utils import patch_fsdp_params_comm_hook
+
+            patch_fsdp_params_comm_hook()
+
+            from colossalai.quantization.fp8 import fp8_compress_fsdp_params_comm_hook
+
+            fsdp_model.module.register_params_comm_hook(None, fp8_compress_fsdp_params_comm_hook)
+
+            from colossalai.quantization.fp8 import fp8_compress_fsdp_grad_comm_hook
+
+            fsdp_model.module.register_comm_hook(None, fp8_compress_fsdp_grad_comm_hook)
+
         if optimizer is not None:
             if len(optimizer.param_groups) > 1:
                 warnings.warn(
diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 52bb8cc9bc33..e933680a931c 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -15,6 +15,7 @@ def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False) -
         scale: scaling factor for fp8 casting. If it is None, then it is computed automatically. Per-channel scaling
         is applied if input tensor is 2 dimension, otherwise, per-tensor scaling is applied.
         fp8_format: e4m3 or e5m2
+
     Returns:
         Tuples: A tuple (fp8_tensor, scale)
     """
@@ -29,12 +30,13 @@ def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False) -
         per_channel_max = inp.abs().max(dim=-1).values.float()
         per_channel_max = torch.where(per_channel_max > 0, per_channel_max, 1.0)
         scale = fp8_max / per_channel_max[:, None]
+        scale_inv = per_channel_max / fp8_max
     else:
         per_tensor_max = inp.abs().max().float()
         per_tensor_max = torch.where(per_tensor_max > 0, per_tensor_max, 1.0)
         scale = fp8_max / per_tensor_max
+        scale_inv = 1.0 / scale
 
-    scale_inv = 1.0 / scale
     ret = (scale * inp.float()).to(fp8_type)
     return ret, scale_inv
 
@@ -185,7 +187,11 @@ def cast_to_fp8_pipeline(inp: Any) -> None:
         return
 
     assert "hidden_states" in inp, "required by pipeline parallelism."
+    assert (
+        inp["hidden_states"].size(-1) % 2 == 0
+    ), "tensor size(-1) must be divisible by 2 to view Float8_e4m3fn as BFloat16 or Float16"
     inp_tensor = inp["hidden_states"]
+    inp_dtype = inp_tensor.dtype
 
     min_val, max_val = inp_tensor.aminmax()
     amax = torch.maximum(min_val.abs(), max_val.abs())
@@ -206,6 +212,7 @@ def cast_to_fp8_pipeline(inp: Any) -> None:
     inp_tensor.data = q_tensor.to(fp8_type).view(fp8_view_type)
 
     inp["fp8_scale"] = scale.float().reciprocal()
+    inp["dtype"] = torch.zeros_like(scale).to(inp_dtype)
 
 
 def cast_from_fp8_pipeline(inp: Any, del_metadata=True) -> None:
@@ -230,10 +237,11 @@ def cast_from_fp8_pipeline(inp: Any, del_metadata=True) -> None:
     else:
         raise TypeError("Only float16, bfloat16 are implemented.")
 
-    inp_tensor.data = inp_tensor.data.view(fp8_type).to(torch.float16) * scale
+    inp_tensor.data = inp_tensor.data.view(fp8_type).to(inp["dtype"]) * scale
 
     if del_metadata:
         del inp["fp8_scale"]
+        del inp["dtype"]
 
 
 def reduce_scatter_fp8(output: torch.Tensor, input_list, group, fp8_format="e5m2") -> None:
@@ -273,6 +281,199 @@ def reduce_scatter_fp8(output: torch.Tensor, input_list, group, fp8_format="e5m2
     output.data = summed_out
 
 
+def fp8_compress_ddp_grad_comm_hook_async(
+    process_group: dist.ProcessGroup,
+    bucket: dist.GradBucket,
+    fp8_format: str = "e5m2",
+) -> torch.futures.Future[torch.Tensor]:
+    """
+    Compress by casting ``GradBucket`` to FP8 floating-point format divided by process group size.
+
+    This DDP communication hook implements a simple gradient compression approach that casts ``GradBucket`` tensor
+    to FP8 floating-point format (``torch.float8_e5m2`` or ``torch.bfloat16_e4m3``), and then divides it
+    by the process group size.
+    Once compressed gradient tensors are allreduced, the chained callback ``decompress`` casts it back
+    to the input data type (such as ``float32``).
+
+    Example::
+        >>> ddp_model.register_comm_hook(process_group, fp8_compress_ddp_grad_comm_hook_async)
+    """
+    group_to_use = process_group if process_group is not None else dist.group.WORLD
+
+    input_tensor = bucket.buffer()
+    world_size = dist.get_world_size()
+    input_type = input_tensor.dtype
+    input_device = input_tensor.device
+    flat_padded_x = input_tensor.flatten()
+
+    if flat_padded_x.size(0) % world_size != 0:
+        pad_size = world_size - flat_padded_x.size(0) % world_size
+        flat_padded_x = F.pad(flat_padded_x, (0, pad_size))
+
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+    ret, scale = cast_to_fp8(flat_padded_x, fp8_format=fp8_format)
+
+    inp = ret.view(torch.uint8)
+    output_chunks_single = torch.empty_like(inp)
+    split_sizes = [inp.numel() // world_size for _ in range(world_size)]
+    fut0 = dist.all_to_all_single(
+        output_chunks_single,
+        inp,
+        output_split_sizes=split_sizes,
+        input_split_sizes=split_sizes,
+        group=group_to_use,
+        async_op=True,
+    ).get_future()
+
+    scale_list = [torch.ones(1, dtype=scale.dtype, device=input_device) for _ in range(world_size)]
+    fut1 = dist.all_gather_into_tensor(
+        torch.cat(scale_list, dim=0), scale, group=group_to_use, async_op=True
+    ).get_future()
+    all_to_all_fut = torch.futures.collect_all([fut0, fut1])
+
+    def sum_and_allgather(fut):
+        output_chunks_single = fut.value()[0].wait()[0]
+        scale_list_single = fut.value()[1].wait()[0]
+
+        output_chunks = list(torch.chunk(output_chunks_single, world_size, dim=0))
+        scale_list = scale_list_single.chunk(world_size, dim=0)
+
+        summed_out = torch.zeros_like(output_chunks[0]).to(input_type)
+        for scale, out in zip(scale_list, output_chunks):
+            out = out.view(fp8_type)
+            summed_out += cast_from_fp8(out, scale, input_type)
+        summed_out.div_(world_size)
+
+        summed_out_fp8, scale = cast_to_fp8(summed_out, fp8_format=fp8_format)
+
+        tensor_list_single = torch.empty(summed_out_fp8.size(0) * world_size, device=input_device, dtype=torch.uint8)
+        fut2 = dist.all_gather_into_tensor(
+            tensor_list_single, summed_out_fp8.view(torch.uint8), group=group_to_use, async_op=True
+        ).get_future()
+
+        scale_list = [torch.ones(1, dtype=scale.dtype, device=input_device) for _ in range(world_size)]
+        fut3 = dist.all_gather_into_tensor(
+            torch.cat(scale_list, dim=0), scale, group=group_to_use, async_op=True
+        ).get_future()
+        fut_combined2 = torch.futures.collect_all([fut2, fut3])
+        return fut_combined2
+
+    def decompress(fut):
+        tensor_list_single = fut.value().wait()[0].value()[0]
+        scale_list_single = fut.value().wait()[1].value()[0]
+
+        tensor_list = list(torch.chunk(tensor_list_single, world_size, dim=0))
+        scale_list = scale_list_single.chunk(world_size, dim=0)
+
+        for i in range(world_size):
+            tensor_list[i] = tensor_list[i].view(fp8_type).to(input_type) * scale_list[i]
+        out = torch.cat(tensor_list, dim=0)
+
+        input_tensor_size = input_tensor.numel()
+        input_shape = input_tensor.shape
+        out = out[:input_tensor_size]
+
+        input_tensor.copy_(out.view(input_shape).to(input_type))
+        return input_tensor
+
+    return all_to_all_fut.then(sum_and_allgather).then(decompress)
+
+
+def fp8_compress_ddp_grad_comm_hook_sync(
+    process_group: dist.ProcessGroup,
+    bucket: dist.GradBucket,
+    fp8_format="e5m2",
+) -> torch.futures.Future[torch.Tensor]:
+    """
+    Return a future that wraps the input, after the input is allreduced. However, the allreduce commnunication is synchronized.
+    This breaks the overlapping between allreduce communication and backward compuation.
+
+    This hook should **only** be used for debugging purposes, instead of the normal gradient synchronization.
+    For asynchronized implementation, use fp8_compress_ddp_grad_comm_hook_async instead.
+
+    Example::
+        >>> # xdoctest: +SKIP
+        >>> ddp_model.register_comm_hook(None, fp8_compress_ddp_grad_comm_hook_sync)
+    """
+
+    buffer = bucket.buffer()
+    all_reduce_fp8(buffer, fp8_format=fp8_format)
+
+    fut: torch.futures.Future[torch.Tensor] = torch.futures.Future()
+    fut.set_result(bucket.buffer())
+
+    return fut
+
+
+def fp8_compress_fsdp_grad_comm_hook(
+    state: object,
+    unsharded_gradient_flattened: torch.Tensor,
+    sharded_gradient: torch.Tensor,
+    group=None,
+    fp8_format="e5m2",
+) -> None:
+    """
+    This communication hook implements a simple gradient compression approach that casts unsharded_gradient_flattened tensor
+    to FP8 floating-point format (``torch.float8_e5m2`` or ``torch.bfloat16_e4m3``), and then perform scatter_allreduce logic
+    by using all_to_all and all_gather among the process group.
+
+    Example::
+        >>> fsdp_model.register_comm_hook(None, fp8_compress_fsdp_grad_comm_hook)
+    """
+    grad = unsharded_gradient_flattened
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+    input_type = grad.dtype
+    input_device = grad.device
+    world_size = dist.get_world_size(group=group)
+
+    grad_fp8, scale = cast_to_fp8(grad, fp8_format=fp8_format)
+    uint8_buffer = torch.empty_like(grad_fp8).view(torch.uint8)
+    dist.all_to_all_single(uint8_buffer, grad_fp8.view(torch.uint8), group=group)
+
+    scale_list = [torch.ones(1, dtype=scale.dtype, device=input_device) for _ in range(world_size)]
+    dist.all_gather(scale_list, scale, group=group)
+
+    buffer_list = list(torch.chunk(uint8_buffer.view(fp8_type), world_size, dim=0))
+    sharded_gradient.zero_()
+    for tensor, scale in zip(buffer_list, scale_list):
+        sharded_gradient += cast_from_fp8(tensor, scale, input_type)
+
+
+def fp8_compress_fsdp_params_comm_hook(
+    state: object,
+    padded_unsharded_flat_param: torch.Tensor,
+    sharded_flat_param: torch.Tensor,
+    group=None,
+    fp8_format="e5m2",
+) -> None:
+    """
+        This hook is pending the official support for parameters communication hook in FSDP, e.g. register_params_comm_hook.
+
+    Example::
+        >>> fsdp_model.register_params_comm_hook(None, fp8_compress_fsdp_params_comm_hook)
+    """
+
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+    fp8_max = torch.finfo(fp8_type).max
+    inp = sharded_flat_param
+    out = padded_unsharded_flat_param
+
+    per_tensor_max = inp.abs().max().float()
+    per_tensor_max = torch.where(per_tensor_max > 0, per_tensor_max, 1.0)
+    dist.all_reduce(per_tensor_max, op=torch.distributed.ReduceOp.MAX, group=group)
+
+    scale = fp8_max / per_tensor_max
+    fp8_sharded_flat_param = (scale * inp.float()).to(fp8_type).view(torch.uint8)
+
+    fp8_out = torch.empty(out.shape, dtype=torch.uint8, device=out.device)
+    dist.all_gather_into_tensor(
+        fp8_out,
+        fp8_sharded_flat_param,
+        group=group,
+    )
+    padded_unsharded_flat_param.copy_((fp8_out.view(fp8_type).float() / scale).to(out.dtype))
+
+
 def split_chunk_by_channel(
     chunk: torch.Tensor, channel_size: int, num_channels: int, rank: int = 0, world_size: int = 1
 ):
@@ -342,7 +543,7 @@ def all_gather_into_tensor_flat_fp8(
     scale_inv = 1.0 / scale
     buffer = torch.empty_like(output_tensor, dtype=fp8_type)
     dist.all_gather_into_tensor(buffer.view(torch.uint8), fp8_input.view(torch.uint8), group=group)
-    numel = np.prod(output_shape)
+    numel = output_shape.numel()
     valid_buffer = buffer[:numel].reshape(output_shape)
     valid_buffer = cast_from_fp8(valid_buffer, scale_inv, input_type, per_channel_scale=(len(output_shape) == 2))
     output_tensor[:numel].copy_(valid_buffer.view(-1))
diff --git a/colossalai/quantization/utils.py b/colossalai/quantization/utils.py
new file mode 100644
index 000000000000..5b1e11c9f345
--- /dev/null
+++ b/colossalai/quantization/utils.py
@@ -0,0 +1,112 @@
+import torch
+import torch.distributed as dist
+from packaging import version
+from torch import Tensor
+from torch.distributed.fsdp._common_utils import _no_dispatch_record_stream
+from torch.distributed.utils import _p_assert
+
+
+def _all_gather_flat_param(
+    self,
+    padded_unsharded_flat_param: Tensor,
+) -> Tensor:
+    """
+    All-gather the handle's flat parameter to the destination ``padded_unsharded_flat_param``.
+
+    Then switch to use the all-gathered tensor.
+    """
+    _p_assert(
+        hasattr(self, "process_group") and hasattr(self, "world_size"),
+        "Expects a process group and world size to have been set via `shard()`",
+    )
+    sharded_flat_param = self.flat_param.data
+    expected_numel = sharded_flat_param.numel() * self.world_size
+    _p_assert(
+        padded_unsharded_flat_param.numel() == expected_numel,
+        f"Expects {expected_numel} numel but got {padded_unsharded_flat_param.numel()}",
+    )
+
+    pg = self._fake_process_group if self._use_fake_all_gather else self.process_group
+
+    # HACK this should be handled by C10D
+    if sharded_flat_param.is_cpu:  # type: ignore[attr-defined]
+        tensor_list = list(torch.chunk(padded_unsharded_flat_param, dist.get_world_size(pg)))
+        work = dist.all_gather(tensor_list, sharded_flat_param, group=pg)
+    else:
+        if self._comm_hook is None:
+            dist.all_gather_into_tensor(
+                padded_unsharded_flat_param,
+                sharded_flat_param,
+                pg,
+            )
+        else:
+            self._comm_hook(None, padded_unsharded_flat_param, sharded_flat_param, pg)
+
+    if self._offload_params:
+        # In case of offloading, `flat_param.data` (i.e. sharded param) is
+        # created on the pre-unshard stream. We need to hand it over to the
+        # unshard stream for all-gather
+        _no_dispatch_record_stream(
+            sharded_flat_param,
+            self._device_handle.current_stream(),  # unshard_stream
+        )
+    return padded_unsharded_flat_param
+
+
+def register_params_comm_hook(self, state: object, hook: callable):
+    """Register a communication hook for FlatParamHandle.
+
+    This is an enhancement that provides a flexible hook to users where they can specify how FSDP unshards
+    parameters across multiple workers.
+
+    .. warning ::
+        FSDP communication hook should be registered before running an initial forward pass
+        and only once.
+
+    Args:
+        state (object): Passed to the hook to maintain any state information during the training process.
+        hook (Callable): Callable, which has one of the following signatures:
+                        1) ``hook: Callable[torch.Tensor] -> None``:
+                        This function takes in a Python tensor, which represents
+                        the full, flattened, unsharded gradient with respect to all variables
+                        corresponding to the model this FSDP unit is wrapping
+                        (that are not wrapped by other FSDP sub-units).
+                        It then performs all necessary processing and returns ``None``;
+                        2) ``hook: Callable[torch.Tensor, torch.Tensor] -> None``:
+                        This function takes in two Python tensors, the first one represents
+                        the full, flattened, unsharded gradient with respect to all variables
+                        corresponding to the model this FSDP unit is wrapping
+                        (that are not wrapped by other FSDP sub-units). The latter
+                        represents a pre-sized tensor to store a chunk of a sharded gradient after
+                        reduction.
+                        In both cases, callable performs all necessary processing and returns ``None``.
+                        Callables with signature 1 are expected to handle gradient communication for a `NO_SHARD` case.
+                        Callables with signature 2 are expected to handle gradient communication for sharded cases.
+
+    """
+    if not self.check_is_root():
+        raise AssertionError("register_comm_hook can only be called on a root instance.")
+
+    # if fsdp_state.sharding_strategy in HYBRID_SHARDING_STRATEGIES:
+    #     raise AssertionError(
+    #         f"Communication hook is not supported for hybrid strategies: {fsdp_state.sharding_strategy}"
+    #     )
+    if self._handle._comm_hook is not None:
+        raise AssertionError("A communication hook is already registered")
+    if not callable(hook):
+        raise ValueError(f"The communication hook must be callable but got {hook}")
+    self._handle._comm_hook = hook
+    self._handle._comm_hook_state = state
+
+
+def patch_fsdp_params_comm_hook():
+    if version.parse(torch.__version__) >= version.parse("2.2.0"):
+        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+        from torch.distributed.fsdp._flat_param import FlatParamHandle
+
+        FlatParamHandle._comm_hook = None
+        FlatParamHandle._comm_hook_state = None
+        FlatParamHandle._all_gather_flat_param = _all_gather_flat_param
+        FSDP.register_params_comm_hook = register_params_comm_hook
+    else:
+        raise RuntimeError("This fsdp_params_comm_hook patch is not supported while torch version under 2.2.0.")
diff --git a/colossalai/zero/gemini/chunk/chunk.py b/colossalai/zero/gemini/chunk/chunk.py
index 969df96214de..e2b7a8f56432 100644
--- a/colossalai/zero/gemini/chunk/chunk.py
+++ b/colossalai/zero/gemini/chunk/chunk.py
@@ -166,6 +166,7 @@ def __init__(
         self.grad_chunk = None
         # the async all-reduce/reduce-scatter work of this grad chunk (None means sync)
         self.grad_reduce_work = None
+        self.fp8_communication = False
 
     @property
     def memory_usage(self) -> Dict[str, int]:
@@ -521,9 +522,17 @@ def __gather(self, async_op: bool = False) -> Optional[dist.Work]:
 
             alloc_storage(self.cuda_global_chunk)
             assert self.cuda_global_chunk.is_contiguous()
-            work = dist.all_gather_into_tensor(
-                self.cuda_global_chunk, self.cuda_shard, self.torch_pg, async_op=async_op
-            )
+            if self.fp8_communication:
+                assert async_op == False, "fp8 all-gather does not support async_op!"
+                from colossalai.quantization.fp8 import all_gather_into_tensor_flat_fp8
+
+                work = all_gather_into_tensor_flat_fp8(
+                    self.cuda_global_chunk, self.cuda_shard, self.cuda_global_chunk.shape, self.torch_pg
+                )
+            else:
+                work = dist.all_gather_into_tensor(
+                    self.cuda_global_chunk, self.cuda_shard, self.torch_pg, async_op=async_op
+                )
 
             self.cuda_shard = None
             self.is_gathered = True
diff --git a/colossalai/zero/gemini/chunk/manager.py b/colossalai/zero/gemini/chunk/manager.py
index d0e1755f40cb..06f9b6d18a6d 100644
--- a/colossalai/zero/gemini/chunk/manager.py
+++ b/colossalai/zero/gemini/chunk/manager.py
@@ -26,6 +26,7 @@ def __init__(
         init_device: Optional[torch.device] = None,
         reuse_fp16_chunk: bool = True,
         max_prefetch: int = 0,
+        fp8_communication: bool = False,
     ) -> None:
         self.device = init_device or get_accelerator().get_current_device()
         self.dp_degree_chunk_size_dict: Dict[int, int] = dict()
@@ -44,6 +45,7 @@ def __init__(
         self.accumulating_grads = False
         self.overflow_counter = torch.tensor([0], dtype=torch.int, device=get_accelerator().get_current_device())
         self._prefetch_stream = get_accelerator().Stream() if max_prefetch else None
+        self.fp8_communication = fp8_communication
 
     def register_tensor(
         self,
@@ -101,6 +103,8 @@ def register_tensor(
                 extra_dp_group=extra_dp_group,
                 **chunk_kwargs,
             )
+            if self.fp8_communication:
+                chunk.fp8_communication = True
 
             chunk_group.append(chunk)
             chunk.append_tensor(tensor)
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index 80b2c7961e29..0b2039a4de0c 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -98,6 +98,7 @@ def __init__(
         extra_dp_group: Optional[ProcessGroup] = None,
         verbose: bool = False,
         enable_async_reduce: bool = True,
+        fp8_communication: bool = False,
     ) -> None:
         assert mixed_precision in (torch.float16, torch.bfloat16)
         reuse_fp16_chunk = master_weights if not enable_gradient_accumulation else False
@@ -122,6 +123,8 @@ def __init__(
                 verbose=verbose,
                 max_prefetch=max_prefetch,
             )
+        if fp8_communication:
+            self.chunk_manager.fp8_communication = True
         self.gemini_manager = GeminiManager(
             placement_policy,
             self.chunk_manager,
diff --git a/examples/language/bert/finetune.py b/examples/language/bert/finetune.py
index 8a59ab6838a6..f048abdd253a 100644
--- a/examples/language/bert/finetune.py
+++ b/examples/language/bert/finetune.py
@@ -179,7 +179,7 @@ def main():
         "--plugin",
         type=str,
         default="torch_ddp",
-        choices=["torch_ddp", "torch_ddp_fp16", "gemini", "low_level_zero", "hybrid_parallel"],
+        choices=["torch_ddp", "torch_ddp_fp16", "gemini", "low_level_zero", "hybrid_parallel", "torch_fsdp"],
         help="plugin to use",
     )
     parser.add_argument(
@@ -215,9 +215,9 @@ def main():
     if args.plugin == "torch_ddp_fp16":
         booster_kwargs["mixed_precision"] = "fp16"
     if args.plugin.startswith("torch_ddp"):
-        plugin = TorchDDPPlugin()
+        plugin = TorchDDPPlugin(fp8_communication=args.use_fp8_comm)
     elif args.plugin == "gemini":
-        plugin = GeminiPlugin(initial_scale=2**5)
+        plugin = GeminiPlugin(initial_scale=2**5, fp8_communication=args.use_fp8_comm)
     elif args.plugin == "low_level_zero":
         plugin = LowLevelZeroPlugin(initial_scale=2**5)
     elif args.plugin == "hybrid_parallel":
@@ -235,6 +235,17 @@ def main():
             initial_scale=1,
             fp8_communication=args.use_fp8_comm,
         )
+    elif args.plugin == "torch_fsdp":
+        from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
+
+        from colossalai.booster.plugin import TorchFSDPPlugin
+
+        plugin = TorchFSDPPlugin(
+            mixed_precision=MixedPrecision(
+                param_dtype=torch.float16, reduce_dtype=torch.float16, buffer_dtype=torch.float16
+            ),
+            fp8_communication=args.use_fp8_comm,
+        )
 
     booster = Booster(plugin=plugin, **booster_kwargs)
 
diff --git a/examples/language/gpt/hybridparallelism/finetune.py b/examples/language/gpt/hybridparallelism/finetune.py
index f447adf693f8..e9f7203e9a78 100644
--- a/examples/language/gpt/hybridparallelism/finetune.py
+++ b/examples/language/gpt/hybridparallelism/finetune.py
@@ -212,7 +212,7 @@ def main():
     if args.plugin == "torch_ddp_fp16":
         booster_kwargs["mixed_precision"] = "fp16"
     if args.plugin.startswith("torch_ddp"):
-        plugin = TorchDDPPlugin()
+        plugin = TorchDDPPlugin(fp8_communication=args.use_fp8_comm)
     elif args.plugin == "gemini":
         plugin = GeminiPlugin(initial_scale=2**5)
     elif args.plugin == "low_level_zero":
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index e530e2d6a153..2bd9671d81b7 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -98,7 +98,7 @@ def main():
     parser.add_argument("--disable-async-reduce", action="store_true", help="Disable the asynchronous reduce operation")
     parser.add_argument("--prefetch_num", type=int, default=0, help="chunk prefetch max number")
     parser.add_argument("--no_cache", action="store_true")
-    parser.add_argument("--overlap_allgather", action="store_true")
+    parser.add_argument("--use_fp8_comm", action="store_true", default=False, help="for using fp8 during communication")
     args = parser.parse_args()
 
     colossalai.launch_from_torch()
@@ -158,6 +158,7 @@ def empty_init():
                     buffer_dtype=torch.float16,
                 ),
                 param_init_fn=empty_init(),
+                fp8_communication=args.use_fp8_comm,
             )
         else:
             plugin = TorchFSDPPlugin(
@@ -165,7 +166,8 @@ def empty_init():
                     param_dtype=torch.float16,
                     reduce_dtype=torch.float16,
                     buffer_dtype=torch.float16,
-                )
+                ),
+                fp8_communication=args.use_fp8_comm,
             )
     elif args.plugin == "fsdp_cpu":
         if use_empty_init:
@@ -177,6 +179,7 @@ def empty_init():
                 ),
                 cpu_offload=CPUOffload(offload_params=True),
                 param_init_fn=empty_init(),
+                fp8_communication=args.use_fp8_comm,
             )
         else:
             plugin = TorchFSDPPlugin(
@@ -186,6 +189,7 @@ def empty_init():
                     buffer_dtype=torch.float16,
                 ),
                 cpu_offload=CPUOffload(offload_params=True),
+                fp8_communication=args.use_fp8_comm,
             )
     elif args.plugin == "3d":
         plugin = HybridParallelPlugin(
@@ -200,9 +204,9 @@ def empty_init():
             enable_flash_attention=args.xformers,
             microbatch_size=args.mbs,
             precision="bf16",
+            dp_outside=False,
             overlap_p2p=args.overlap,
             enable_metadata_cache=not args.no_cache,
-            overlap_allgather=args.overlap_allgather,
             **hybrid_kwargs,
         )
     elif args.plugin == "3d_cpu":
@@ -293,7 +297,7 @@ def empty_init():
     with get_profile_context(
         args.profile,
         args.ignore_steps,
-        1,  # avoid creating massive log files
+        len(dataloader) - 1,
         save_dir=f"profile/{time.strftime('%H:%M', time.localtime())}-{args.plugin}-llama-{args.config}",
     ) as prof:
         if isinstance(plugin, HybridParallelPlugin) and args.pp > 1:
diff --git a/tests/test_fp8/test_fp8_cast.py b/tests/test_fp8/test_fp8_cast.py
new file mode 100644
index 000000000000..db9a909e60a7
--- /dev/null
+++ b/tests/test_fp8/test_fp8_cast.py
@@ -0,0 +1,26 @@
+import torch
+from torch.testing import assert_close
+
+from colossalai.accelerator import get_accelerator
+from colossalai.quantization.fp8 import cast_from_fp8, cast_from_fp8_pipeline, cast_to_fp8, cast_to_fp8_pipeline
+from colossalai.testing import parameterize
+
+
+@parameterize("shape", [(100, 10), (10, 100), (3, 7), (2, 1), (1, 2), (2, 2), (4, 2), (5,), (4,), (2,)])
+@parameterize("dtype", [torch.bfloat16, torch.float16, torch.float32])
+@parameterize("fp8_format", ["e4m3", "e5m2"])
+def test_fp8_cast(shape, dtype, fp8_format):
+    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
+    ret, scale_inv = cast_to_fp8(x, fp8_format=fp8_format)
+    out = cast_from_fp8(ret, scale_inv, x.dtype)
+    assert_close(out, x, rtol=0.1, atol=0.1)
+
+    if x.size(-1) % 2 == 0:
+        inp_dict = {"hidden_states": x.clone()}
+        cast_to_fp8_pipeline(inp_dict)
+        cast_from_fp8_pipeline(inp_dict)
+        assert_close(inp_dict["hidden_states"], x, rtol=0.1, atol=0.1)
+
+
+if __name__ == "__main__":
+    test_fp8_cast()
diff --git a/tests/test_fp8/test_fp8_ddp_comm_hook.py b/tests/test_fp8/test_fp8_ddp_comm_hook.py
new file mode 100644
index 000000000000..9bdfe17a1465
--- /dev/null
+++ b/tests/test_fp8/test_fp8_ddp_comm_hook.py
@@ -0,0 +1,87 @@
+import os
+
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+import torch.nn as nn
+import torch.optim as optim
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.testing import assert_close
+
+# example modified from https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
+
+
+def setup(rank, world_size):
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+
+    # initialize the process group
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+
+def cleanup():
+    dist.destroy_process_group()
+
+
+class ToyModel(nn.Module):
+    def __init__(self):
+        super(ToyModel, self).__init__()
+        self.net1 = nn.Linear(10, 10)
+        self.relu = nn.ReLU()
+        self.net2 = nn.Linear(10, 5)
+
+    def forward(self, x):
+        return self.net2(self.relu(self.net1(x)))
+
+
+def demo_basic(rank, world_size):
+    print(f"Running basic DDP example on rank {rank}.")
+    setup(rank, world_size)
+
+    def get_grads_after_one_iteration(hook=None):
+        torch.manual_seed(0)
+        # create model and move it to GPU with id rank
+        model = ToyModel().to(rank)
+
+        ddp_model = DDP(model, device_ids=[rank])
+
+        if hook is not None:
+            ddp_model.register_comm_hook(None, hook)
+
+        loss_fn = nn.MSELoss()
+        optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
+
+        optimizer.zero_grad()
+        outputs = ddp_model(torch.randn(20, 10))
+        labels = torch.randn(20, 5).to(rank)
+        loss_fn(outputs, labels).backward()
+        optimizer.step()
+
+        torch.distributed.barrier()
+
+        grad_dict = {}
+        for name, params in ddp_model.named_parameters():
+            grad_dict[name] = params.grad
+        return grad_dict
+
+    from colossalai.quantization.fp8 import fp8_compress_ddp_grad_comm_hook_async, fp8_compress_ddp_grad_comm_hook_sync
+
+    grad_dict = get_grads_after_one_iteration()
+    for hook in [fp8_compress_ddp_grad_comm_hook_sync, fp8_compress_ddp_grad_comm_hook_async]:
+        grad_dict_w_hook = get_grads_after_one_iteration(hook)
+        if dist.get_rank() == 0:
+            for name in grad_dict:
+                assert_close(grad_dict[name], grad_dict_w_hook[name], rtol=0.1, atol=0.1)
+
+    cleanup()
+
+
+def run_demo(demo_fn, world_size):
+    mp.spawn(demo_fn, args=(world_size,), nprocs=world_size, join=True)
+
+
+if __name__ == "__main__":
+    n_gpus = torch.cuda.device_count()
+    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
+    world_size = n_gpus
+    run_demo(demo_basic, world_size)
diff --git a/tests/test_fp8/test_fp8_fsdp_comm_hook.py b/tests/test_fp8/test_fp8_fsdp_comm_hook.py
new file mode 100644
index 000000000000..3d0660961f17
--- /dev/null
+++ b/tests/test_fp8/test_fp8_fsdp_comm_hook.py
@@ -0,0 +1,107 @@
+import pytest
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.optim as optim
+from packaging import version
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.testing import assert_close
+
+from colossalai import launch
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+
+# example modified from https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
+
+
+def cleanup():
+    dist.destroy_process_group()
+
+
+class ToyModel(nn.Module):
+    def __init__(self):
+        super(ToyModel, self).__init__()
+        self.net1 = nn.Linear(100, 100)
+        self.relu = nn.ReLU()
+        self.net2 = nn.Linear(100, 50)
+
+    def forward(self, x):
+        return self.net2(self.relu(self.net1(x)))
+
+
+@parameterize("mode", ["grad", "params"])
+def run_model(mode):
+    rank = dist.get_rank()
+
+    from colossalai.quantization.utils import patch_fsdp_params_comm_hook
+
+    patch_fsdp_params_comm_hook()
+
+    def get_grads_after_one_iteration(grad_hook=None, params_hook=None):
+        torch.manual_seed(0)
+        # create model and move it to GPU with id rank
+        model = ToyModel().to(rank)
+        fsdp_model = FSDP(model)
+
+        if grad_hook is not None:
+            fsdp_model.register_comm_hook(None, grad_hook)
+
+        if params_hook is not None:
+            fsdp_model.register_params_comm_hook(None, params_hook)
+
+        loss_fn = nn.MSELoss()
+        optimizer = optim.SGD(fsdp_model.parameters(), lr=0.001)
+
+        optimizer.zero_grad()
+        outputs = fsdp_model(torch.randn(20, 100))
+        labels = torch.randn(20, 50).to(rank)
+        loss_fn(outputs, labels).backward()
+        optimizer.step()
+
+        torch.distributed.barrier()
+
+        grad_dict = {}
+        for name, params in fsdp_model.named_parameters():
+            grad_dict[name] = params.grad
+        return grad_dict
+
+    from colossalai.quantization.fp8 import fp8_compress_fsdp_grad_comm_hook, fp8_compress_fsdp_params_comm_hook
+
+    if mode == "grad":
+        grad_dict = get_grads_after_one_iteration()
+        for hook in [
+            fp8_compress_fsdp_grad_comm_hook,
+        ]:
+            grad_dict_w_hook = get_grads_after_one_iteration(grad_hook=hook)
+            if dist.get_rank() == 0:
+                for name in grad_dict:
+                    assert_close(grad_dict[name], grad_dict_w_hook[name], rtol=0.1, atol=0.1)
+    elif mode == "params":
+        grad_dict = get_grads_after_one_iteration()
+        for hook in [
+            fp8_compress_fsdp_params_comm_hook,
+        ]:
+            grad_dict_w_hook = get_grads_after_one_iteration(params_hook=hook)
+            if dist.get_rank() == 0:
+                for name in grad_dict:
+                    assert_close(grad_dict[name], grad_dict_w_hook[name], rtol=0.1, atol=0.1)
+    else:
+        raise NotImplementedError
+
+
+def demo_basic(rank, world_size, port):
+    print(f"Running basic FSDP example on rank {rank}.")
+    launch(rank=rank, world_size=world_size, port=port, host="localhost")
+    run_model()
+    cleanup()
+
+
+@pytest.mark.skipif(version.parse(torch.__version__) < version.parse("2.2.0"), reason="torch version < 2.2.0.")
+@rerun_if_address_is_in_use()
+def test_fsdp():
+    n_gpus = torch.cuda.device_count()
+    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
+    spawn(demo_basic, n_gpus)
+
+
+if __name__ == "__main__":
+    test_fsdp()

From 4b9bec81766f0a536e1c8317cba38765903ec93a Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 8 Aug 2024 17:19:21 +0800
Subject: [PATCH 022/167] [test ci]Feature/fp8 comm (#5981)

* fix

* fix

* fix
---
 .github/workflows/example_check_on_pr.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml
index 56fa006b1633..7a906738cb96 100644
--- a/.github/workflows/example_check_on_pr.yml
+++ b/.github/workflows/example_check_on_pr.yml
@@ -9,6 +9,7 @@ on:
     paths:
       - "examples/**"
       - "!examples/**.md"
+      - ".github/workflows/example_check_on_pr.yml"
 
 jobs:
   # This is for changed example files detect and output a matrix containing all the corresponding directory name.
@@ -107,7 +108,7 @@ jobs:
 
       - name: Install Colossal-AI
         run: |
-          BUILD_EXT=1 pip install -v .
+          BUILD_EXT=1 pip install -v -e .
 
       - name: Store Colossal-AI Cache
         run: |

From 8241c0c054b38a109ed3ce7be1052a1e600b8471 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Fri, 9 Aug 2024 14:09:48 +0800
Subject: [PATCH 023/167] [fp8] support gemini plugin (#5978)

* [fp8] refactor hook

* [fp8] support gemini plugin

* [example] add fp8 option for llama benchmark
---
 colossalai/booster/plugin/gemini_plugin.py            |  2 ++
 colossalai/booster/plugin/hybrid_parallel_plugin.py   |  2 +-
 colossalai/quantization/fp8.py                        |  4 ++--
 .../{booster/plugin => quantization}/fp8_hook.py      |  0
 colossalai/zero/gemini/gemini_ddp.py                  | 11 ++++++++---
 examples/language/llama/benchmark.py                  |  7 +++++++
 tests/test_fp8/test_fp8_hook.py                       |  2 +-
 7 files changed, 21 insertions(+), 7 deletions(-)
 rename colossalai/{booster/plugin => quantization}/fp8_hook.py (100%)

diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index 5ab8f05adc85..d0d3275cfab9 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -363,6 +363,7 @@ def __init__(
         enable_jit_fused: bool = False,
         enable_sequence_overlap: bool = False,
         enable_async_reduce: bool = True,
+        use_fp8: bool = False,
         verbose: bool = False,
         fp8_communication: bool = False,
     ) -> None:
@@ -397,6 +398,7 @@ def __init__(
             max_prefetch=max_prefetch,
             enable_async_reduce=enable_async_reduce,
             fp8_communication=fp8_communication,
+            use_fp8=use_fp8,
         )
         self.zero_optim_config = dict(
             gpu_margin_mem_ratio=gpu_margin_mem_ratio,
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index f585abdb5d04..9c0ee9e501ae 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -31,6 +31,7 @@
 from colossalai.pipeline.schedule import InterleavedSchedule, OneForwardOneBackwardSchedule
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.quantization import BnbQuantizationConfig, quantize_model
+from colossalai.quantization.fp8_hook import FP8Hook
 from colossalai.shardformer import GradientCheckpointConfig, ShardConfig, ShardFormer
 from colossalai.shardformer.layer.utils import SeqParallelUtils
 from colossalai.shardformer.policies.base_policy import Policy
@@ -40,7 +41,6 @@
 from colossalai.zero.low_level import LowLevelZeroOptimizer
 from colossalai.zero.low_level.zero_hook import ZeroOpHook, wait_all_gather_handle
 
-from .fp8_hook import FP8Hook
 from .pp_plugin_base import PipelinePluginBase
 
 SUPPORT_SP_MODE = ["split_gather", "ring", "all_to_all"]
diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index e933680a931c..53febd16c8f6 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -652,5 +652,5 @@ def backward(ctx: Any, out_grad) -> Any:
         return x_grad.reshape(ctx.x_shape), w_grad, bias_grad
 
 
-def linear_fp8(x: torch.Tensor, w: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-    return _LinearFp8.apply(x, w, bias)
+def linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    return _LinearFp8.apply(input, weight, bias)
diff --git a/colossalai/booster/plugin/fp8_hook.py b/colossalai/quantization/fp8_hook.py
similarity index 100%
rename from colossalai/booster/plugin/fp8_hook.py
rename to colossalai/quantization/fp8_hook.py
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index 0b2039a4de0c..dbaae66108fe 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -15,6 +15,7 @@
 from colossalai.interface import ModelWrapper
 from colossalai.lazy import LazyTensor
 from colossalai.logging import get_dist_logger
+from colossalai.quantization.fp8_hook import FP8Hook
 from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.tensor.d_tensor import (
     distribute_tensor,
@@ -99,6 +100,7 @@ def __init__(
         verbose: bool = False,
         enable_async_reduce: bool = True,
         fp8_communication: bool = False,
+        use_fp8: bool = False,
     ) -> None:
         assert mixed_precision in (torch.float16, torch.bfloat16)
         reuse_fp16_chunk = master_weights if not enable_gradient_accumulation else False
@@ -138,6 +140,9 @@ def __init__(
         )
         self.force_outputs_fp32 = force_outputs_fp32
         self.param_op_hook = GeminiZeROHook(self.gemini_manager)
+        self.hooks = [self.param_op_hook]
+        if use_fp8:
+            self.hooks.append(FP8Hook())
         self.fp32_params: List[torch.Tensor] = list()
         self.fp16_params: List[ColoParameter] = list()
         self.grads_device: Dict[torch.Tensor, torch.device] = dict()
@@ -310,7 +315,7 @@ def forward(self, *args, **kwargs):
             outputs = self._inference_forward(*args, **kwargs)
         else:
             self.gemini_manager.pre_iter(*args)
-            with ColoParamOpHookManager.use_hooks(self.param_op_hook):
+            with ColoParamOpHookManager.use_hooks(*self.hooks):
                 outputs = self.module(*args, **kwargs)
 
         if self.force_outputs_fp32:
@@ -319,7 +324,7 @@ def forward(self, *args, **kwargs):
 
     def _inference_forward(self, *args, **kwargs):
         """This function is only triggered for inference."""
-        fwd_ctx = ColoParamOpHookManager.use_hooks(self.param_op_hook)
+        fwd_ctx = ColoParamOpHookManager.use_hooks(*self.hooks)
         if not self.scatter_after_inference:
             # gather all chunks
             for chunk in self.chunk_manager.get_chunks(self.fp16_params):
@@ -372,7 +377,7 @@ def _post_backward(self):
 
     def backward(self, loss: torch.Tensor):
         self._pre_backward()
-        with self.param_op_hook.switch_to_backward(), ColoParamOpHookManager.use_hooks(self.param_op_hook):
+        with self.param_op_hook.switch_to_backward(), ColoParamOpHookManager.use_hooks(*self.hooks):
             loss.backward()
         self._post_backward()
 
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 2bd9671d81b7..07583161b6fb 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -99,6 +99,8 @@ def main():
     parser.add_argument("--prefetch_num", type=int, default=0, help="chunk prefetch max number")
     parser.add_argument("--no_cache", action="store_true")
     parser.add_argument("--use_fp8_comm", action="store_true", default=False, help="for using fp8 during communication")
+    parser.add_argument("--overlap_allgather", action="store_true")
+    parser.add_argument("--use_fp8", action="store_true")
     args = parser.parse_args()
 
     colossalai.launch_from_torch()
@@ -136,6 +138,7 @@ def empty_init():
             enable_flash_attention=args.xformers,
             max_prefetch=args.prefetch_num,
             enable_async_reduce=not args.disable_async_reduce,
+            use_fp8=args.use_fp8,
         )
     elif args.plugin == "gemini_auto":
         plugin = GeminiPlugin(
@@ -148,6 +151,7 @@ def empty_init():
             max_prefetch=args.prefetch_num,
             enable_async_reduce=not args.disable_async_reduce,
             enable_flash_attention=args.xformers,
+            use_fp8=args.use_fp8,
         )
     elif args.plugin == "fsdp":
         if use_empty_init:
@@ -207,6 +211,8 @@ def empty_init():
             dp_outside=False,
             overlap_p2p=args.overlap,
             enable_metadata_cache=not args.no_cache,
+            overlap_allgather=args.overlap_allgather,
+            use_fp8=args.use_fp8,
             **hybrid_kwargs,
         )
     elif args.plugin == "3d_cpu":
@@ -223,6 +229,7 @@ def empty_init():
             initial_scale=2**8,
             precision="bf16",
             overlap_p2p=args.overlap,
+            use_fp8=args.use_fp8,
         )
     else:
         raise ValueError(f"Unknown plugin {args.plugin}")
diff --git a/tests/test_fp8/test_fp8_hook.py b/tests/test_fp8/test_fp8_hook.py
index 6cc147be72b7..abd5d09e128e 100644
--- a/tests/test_fp8/test_fp8_hook.py
+++ b/tests/test_fp8/test_fp8_hook.py
@@ -4,8 +4,8 @@
 import torch.nn.functional as F
 
 from colossalai.accelerator import get_accelerator
-from colossalai.booster.plugin.fp8_hook import FP8Hook
 from colossalai.quantization.fp8 import linear_fp8
+from colossalai.quantization.fp8_hook import FP8Hook
 from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
 from colossalai.utils import get_current_device

From e4aadeee20b248dd68b5aa7447b8eb1f127a259b Mon Sep 17 00:00:00 2001
From: botbw <wang1570@e.ntu.edu.sg>
Date: Fri, 9 Aug 2024 15:51:06 +0800
Subject: [PATCH 024/167] [fp8] use torch compile (torch >= 2.3.0) (#5979)

* [fp8] use torch compile (torch >= 2.4.0)

* [fp8] set use_fast_accum in linear

* [chore] formal version check

* [chore] fix sig
---
 colossalai/quantization/fp8.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 53febd16c8f6..cfbf1fcf7e40 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -1,13 +1,14 @@
-from typing import Any, Optional
+from typing import Any, Optional, Tuple
 
 import numpy as np
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
+from packaging.version import Version
 from torch.distributed import ReduceOp
 
 
-def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False) -> (torch.Tensor, torch.Tensor):
+def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False) -> Tuple[torch.Tensor, torch.Tensor]:
     r"""
     casting torch Tensor into specified fp8 tensor with per-channel scaling or per-tensor scaling.
     Args:
@@ -624,7 +625,13 @@ def forward(
         ctx.inv_scale_x = inv_scale_x
         ctx.inv_scale_w = inv_scale_w
         out = torch._scaled_mm(
-            x_fp8, ctx.w_fp8_t, bias=bias, out_dtype=ctx.out_dtype, scale_a=inv_scale_x, scale_b=inv_scale_w
+            x_fp8,
+            ctx.w_fp8_t,
+            bias=bias,
+            out_dtype=ctx.out_dtype,
+            scale_a=inv_scale_x,
+            scale_b=inv_scale_w,
+            use_fast_accum=True,
         )[0]
         return out.reshape(*ctx.x_shape[:-1], w.shape[0])
 
@@ -638,6 +645,7 @@ def backward(ctx: Any, out_grad) -> Any:
             out_dtype=ctx.out_dtype,
             scale_a=out_grad_scale,
             scale_b=ctx.inv_scale_w,
+            use_fast_accum=True,
         )[0]
         w_grad = torch._scaled_mm(
             out_grad_fp8.t().contiguous(),
@@ -645,6 +653,7 @@ def backward(ctx: Any, out_grad) -> Any:
             out_dtype=ctx.out_dtype,
             scale_a=out_grad_scale,
             scale_b=ctx.inv_scale_x,
+            use_fast_accum=True,
         )[0]
         bias_grad = None
         if ctx.has_bias:
@@ -652,5 +661,13 @@ def backward(ctx: Any, out_grad) -> Any:
         return x_grad.reshape(ctx.x_shape), w_grad, bias_grad
 
 
-def linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-    return _LinearFp8.apply(input, weight, bias)
+if Version(torch.__version__) >= Version("2.3.0"):  # TODO failed on torch < 2.3.0
+
+    @torch.compile(mode="reduce-overhead", fullgraph=True)
+    def linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return _LinearFp8.apply(input, weight, bias)
+
+else:
+
+    def linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        return _LinearFp8.apply(input, weight, bias)

From f1a3a326c47e48819be35f2d760192cea3e447c5 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 9 Aug 2024 18:26:02 +0800
Subject: [PATCH 025/167] [fp8]Moe support fp8 communication (#5977)

* fix

* support moe fp8

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

fix

fi

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../plugin/moe_hybrid_parallel_plugin.py      |  2 +
 colossalai/moe/_operation.py                  | 23 +++++--
 colossalai/quantization/fp8.py                | 23 ++++---
 colossalai/shardformer/layer/embedding.py     |  4 +-
 colossalai/shardformer/modeling/deepseek.py   | 69 ++++++++++++++-----
 colossalai/shardformer/modeling/mixtral.py    | 59 ++++++++++++----
 colossalai/shardformer/policies/deepseek.py   | 12 +++-
 colossalai/shardformer/policies/mixtral.py    | 20 ++++--
 8 files changed, 160 insertions(+), 52 deletions(-)

diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index b3415af0eed6..ca3a68373184 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -214,6 +214,7 @@ def __init__(
         moe_dp_outside: bool = True,
         overlap_p2p: bool = True,
         overlap_allgather: bool = False,
+        fp8_communication: bool = False,
     ) -> None:
         if overlap_communication or zero_stage == 2:
             overlap_communication = False
@@ -341,6 +342,7 @@ def __init__(
             parallel_output=parallel_output,
             make_vocab_size_divisible_by=make_vocab_size_divisible_by,
             gradient_checkpoint_config=gradient_checkpoint_config,
+            fp8_communication=fp8_communication,
         )
         self.amp_config = dict(
             initial_scale=initial_scale,
diff --git a/colossalai/moe/_operation.py b/colossalai/moe/_operation.py
index ac422a4da98f..ba087a03b728 100644
--- a/colossalai/moe/_operation.py
+++ b/colossalai/moe/_operation.py
@@ -6,6 +6,8 @@
 from torch.cuda.amp import custom_bwd, custom_fwd
 from torch.distributed import ProcessGroup
 
+from colossalai.quantization.fp8 import all_to_all_single_fp8
+
 MOE_KERNEL = None
 
 
@@ -380,6 +382,7 @@ def _all_to_all(
     output_split_sizes: Optional[List[int]] = None,
     group=None,
     async_op: bool = False,
+    fp8_communication: bool = False,
 ):
     """
     Returns:
@@ -392,9 +395,14 @@ def _all_to_all(
     outputs = torch.empty(outputs_shape, dtype=inputs.dtype, device=inputs.device)
     inputs = inputs.contiguous()
     outputs = outputs.contiguous()
-    handle = dist.all_to_all_single(
-        outputs, inputs, output_split_sizes, input_split_sizes, group=group, async_op=async_op
-    )
+    if fp8_communication:
+        handle = all_to_all_single_fp8(
+            outputs, inputs, output_split_sizes, input_split_sizes, group=group, async_op=False
+        )
+    else:
+        handle = dist.all_to_all_single(
+            outputs, inputs, output_split_sizes, input_split_sizes, group=group, async_op=async_op
+        )
     return outputs, handle
 
 
@@ -407,6 +415,7 @@ def forward(
         output_split_sizes=None,
         group=None,
         overlap: bool = False,
+        fp8_communication: bool = False,
     ):
         """
         Returns:
@@ -416,7 +425,9 @@ def forward(
         ctx.input_split_sizes = input_split_sizes
         ctx.output_split_sizes = output_split_sizes
         ctx.group = group
-        return _all_to_all(inputs, input_split_sizes, output_split_sizes, group, overlap)
+        return _all_to_all(
+            inputs, input_split_sizes, output_split_sizes, group, overlap, fp8_communication=fp8_communication
+        )
 
     @staticmethod
     def backward(ctx: Any, *grad_outputs):
@@ -426,6 +437,7 @@ def backward(ctx: Any, *grad_outputs):
             None,
             None,
             None,
+            None,
         )
 
 
@@ -435,8 +447,9 @@ def all_to_all_uneven(
     output_split_sizes: Optional[List[int]] = None,
     group=None,
     overlap: bool = False,
+    fp8_communication: bool = False,
 ):
     assert (
         inputs.requires_grad
     ), "Input must require grad to assure that backward is executed, otherwise it might hang the program."
-    return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap)
+    return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap, fp8_communication)
diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index cfbf1fcf7e40..f2bffa09f15d 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -27,16 +27,19 @@ def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False) -
     fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
     fp8_max = torch.finfo(fp8_type).max
 
-    if per_channel_scale:
-        per_channel_max = inp.abs().max(dim=-1).values.float()
-        per_channel_max = torch.where(per_channel_max > 0, per_channel_max, 1.0)
-        scale = fp8_max / per_channel_max[:, None]
-        scale_inv = per_channel_max / fp8_max
+    if inp.numel() == 0:
+        return inp.to(fp8_type), torch.tensor([1.0], device=inp.device)
     else:
-        per_tensor_max = inp.abs().max().float()
-        per_tensor_max = torch.where(per_tensor_max > 0, per_tensor_max, 1.0)
-        scale = fp8_max / per_tensor_max
-        scale_inv = 1.0 / scale
+        if per_channel_scale:
+            per_channel_max = inp.abs().max(dim=-1).values.float()
+            per_channel_max = torch.where(per_channel_max > 0, per_channel_max, 1.0)
+            scale = fp8_max / per_channel_max[:, None]
+            scale_inv = per_channel_max / fp8_max
+        else:
+            per_tensor_max = inp.abs().max().float()
+            per_tensor_max = torch.where(per_tensor_max > 0, per_tensor_max, 1.0)
+            scale = fp8_max / per_tensor_max
+            scale_inv = 1.0 / scale
 
     ret = (scale * inp.float()).to(fp8_type)
     return ret, scale_inv
@@ -113,7 +116,7 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3", op=ReduceOp.SUM, gro
     tensor_list = [torch.empty_like(summed_out_fp8.view(torch.uint8)) for _ in range(world_size)]
     dist.all_gather(tensor_list, summed_out_fp8.view(torch.uint8), group=group)
     for i in range(world_size):
-        tensor_list[i] = tensor_list[i].view(fp8_type).to(input_type) * scale_list[i]
+        tensor_list[i] = tensor_list[i].view(fp8_type).to(input_type) * scale_list[i].to(input_device)
     out = torch.cat(tensor_list, dim=0)
     tensor.copy_(out[:input_size].view(input_shape).to(input_type))
 
diff --git a/colossalai/shardformer/layer/embedding.py b/colossalai/shardformer/layer/embedding.py
index 9b77774aaeaa..186063503fd4 100644
--- a/colossalai/shardformer/layer/embedding.py
+++ b/colossalai/shardformer/layer/embedding.py
@@ -274,6 +274,7 @@ def __init__(
         weight: Optional[nn.Parameter] = None,
         weight_initializer: Callable = init.normal_(),
         make_vocab_size_divisible_by: int = 64,
+        fp8_communication: bool = False,
         *args,
         **kwargs,
     ):
@@ -282,6 +283,7 @@ def __init__(
         self.embed_args = args
         self.embed_kwargs = kwargs
         self.process_group = process_group
+        self.fp8_communication = fp8_communication
 
         tensor_parallel_size = dist.get_world_size(group=process_group)
         tensor_parallel_rank = dist.get_rank(group=process_group)
@@ -390,5 +392,5 @@ def forward(self, input_: Tensor) -> Tensor:
         embedding_output = output_parallel.clone()
         embedding_output[input_mask, :] = 0.0
         # Reduce across all the model parallel GPUs.
-        output = reduce_forward(embedding_output, self.process_group)
+        output = reduce_forward(embedding_output, self.process_group, fp8_communication=self.fp8_communication)
         return output
diff --git a/colossalai/shardformer/modeling/deepseek.py b/colossalai/shardformer/modeling/deepseek.py
index a84a3097231a..48a629705a7c 100644
--- a/colossalai/shardformer/modeling/deepseek.py
+++ b/colossalai/shardformer/modeling/deepseek.py
@@ -24,6 +24,7 @@
     all_to_all_uneven,
 )
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.quantization.fp8 import all_reduce_fp8
 from colossalai.shardformer.layer._operation import (
     all_to_all_comm,
     gather_forward_split_backward,
@@ -61,7 +62,13 @@ class EPDeepseekMoE(nn.Module):
     def __init__(self):
         raise RuntimeError(f"Please use `from_native_module` to create an instance of {self.__class__.__name__}")
 
-    def setup_process_groups(self, tp_group: ProcessGroup, moe_dp_group: ProcessGroup, ep_group: ProcessGroup):
+    def setup_process_groups(
+        self,
+        tp_group: ProcessGroup,
+        moe_dp_group: ProcessGroup,
+        ep_group: ProcessGroup,
+        fp8_communication: bool = False,
+    ):
         assert tp_group is not None
         assert moe_dp_group is not None
         assert ep_group is not None
@@ -70,6 +77,7 @@ def setup_process_groups(self, tp_group: ProcessGroup, moe_dp_group: ProcessGrou
         self.ep_rank = dist.get_rank(ep_group)
         self.num_experts = self.config.n_routed_experts
         assert self.num_experts % self.ep_size == 0
+        self.fp8_communication = fp8_communication
 
         self.ep_group = ep_group
         self.num_experts_per_ep = self.num_experts // self.ep_size
@@ -86,9 +94,15 @@ def setup_process_groups(self, tp_group: ProcessGroup, moe_dp_group: ProcessGrou
         self.tp_group = tp_group
         if self.tp_group.size() > 1:
             for expert in held_experts:
-                expert.gate_proj = Linear1D_Col.from_native_module(expert.gate_proj, self.tp_group)
-                expert.up_proj = Linear1D_Col.from_native_module(expert.up_proj, self.tp_group)
-                expert.down_proj = Linear1D_Row.from_native_module(expert.down_proj, self.tp_group)
+                expert.gate_proj = Linear1D_Col.from_native_module(
+                    expert.gate_proj, self.tp_group, fp8_communication=self.fp8_communication
+                )
+                expert.up_proj = Linear1D_Col.from_native_module(
+                    expert.up_proj, self.tp_group, fp8_communication=self.fp8_communication
+                )
+                expert.down_proj = Linear1D_Row.from_native_module(
+                    expert.down_proj, self.tp_group, fp8_communication=self.fp8_communication
+                )
 
         for p in self.experts.parameters():
             set_moe_tensor_ep_group(p, ep_group)
@@ -106,7 +120,8 @@ def from_native_module(
         if module.__class__.__name__ == "DeepseekMLP":
             return module
         module.__class__ = EPDeepseekMoE
-        module.setup_process_groups(tp_group, moe_dp_group, ep_group)
+        fp8_communication = kwargs.get("fp8_communication", False)
+        module.setup_process_groups(tp_group, moe_dp_group, ep_group, fp8_communication=fp8_communication)
         return module
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -137,11 +152,21 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             for i in range(1, self.ep_size):
                 activate_experts += output_split_sizes[i * self.num_experts_per_ep : (i + 1) * self.num_experts_per_ep]
             activate_experts = (activate_experts > 0).float()
-        dist.all_reduce(activate_experts, group=self.moe_dp_group)
+
+        if self.fp8_communication:
+            all_reduce_fp8(activate_experts, group=self.moe_dp_group)
+        else:
+            dist.all_reduce(activate_experts, group=self.moe_dp_group)
 
         input_split_list = input_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()
         output_split_list = output_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()
-        output_states, _ = all_to_all_uneven(dispatch_states, input_split_list, output_split_list, self.ep_group)
+        output_states, _ = all_to_all_uneven(
+            dispatch_states,
+            input_split_list,
+            output_split_list,
+            self.ep_group,
+            fp8_communication=self.fp8_communication,
+        )
         output_states = EPGradScalerIn.apply(output_states, self.ep_size)
 
         if output_states.size(0) > 0:
@@ -167,7 +192,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                     output_states_list.append(split_states)
                 output_states = torch.cat(output_states_list)
         output_states = EPGradScalerOut.apply(output_states, self.ep_size)
-        dispatch_states, _ = all_to_all_uneven(output_states, output_split_list, input_split_list, self.ep_group)
+        dispatch_states, _ = all_to_all_uneven(
+            output_states, output_split_list, input_split_list, self.ep_group, fp8_communication=self.fp8_communication
+        )
         recover_token_idx = torch.empty_like(flat_topk_token_idx)
         recover_token_idx[flat_topk_token_idx] = torch.arange(
             flat_topk_token_idx.size(0), device=flat_topk_token_idx.device
@@ -534,9 +561,9 @@ def forward(
 
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
-            query_states = all_to_all_comm(query_states, sp_group)
-            key_states = all_to_all_comm(key_states, sp_group)
-            value_states = all_to_all_comm(value_states, sp_group)
+            query_states = all_to_all_comm(query_states, sp_group, fp8_communication=shard_config.fp8_communication)
+            key_states = all_to_all_comm(key_states, sp_group, fp8_communication=shard_config.fp8_communication)
+            value_states = all_to_all_comm(value_states, sp_group, fp8_communication=shard_config.fp8_communication)
             bsz, q_len, _ = query_states.size()
         # Flash attention requires the input to have the shape
         # batch_size x seq_length x head_dim x hidden_dim
@@ -595,7 +622,9 @@ def forward(
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
             attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()  # (1, 8, 128)
-            attn_output = all_to_all_comm(attn_output, sp_group, scatter_dim=1, gather_dim=2)  # (1, 4, 256)
+            attn_output = all_to_all_comm(
+                attn_output, sp_group, scatter_dim=1, gather_dim=2, fp8_communication=shard_config.fp8_communication
+            )  # (1, 4, 256)
         else:
             attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
@@ -685,9 +714,13 @@ def forward(
             )
 
         if sp_mode in ["ring", "split_gather"]:
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group)
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds, 1, sp_group, fp8_communication=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group, 1 / sp_size)
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds, 1, sp_group, 1 / sp_size, fp8_communication=shard_config.fp8_communication
+            )
         # embed positions
         hidden_states = inputs_embeds
 
@@ -731,9 +764,13 @@ def forward(
         hidden_states = self.norm(hidden_states)
 
         if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
+            )
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index d30ce5ea85cc..76b441fe4258 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -53,7 +53,13 @@ class EPMixtralSparseMoeBlock(MixtralSparseMoeBlock):
     def __init__(self, *args, **kwargs):
         raise RuntimeError(f"Please use `from_native_module` to create an instance of {self.__class__.__name__}")
 
-    def setup_process_groups(self, tp_group: ProcessGroup, moe_dp_group: ProcessGroup, ep_group: ProcessGroup):
+    def setup_process_groups(
+        self,
+        tp_group: ProcessGroup,
+        moe_dp_group: ProcessGroup,
+        ep_group: ProcessGroup,
+        fp8_communication: bool = False,
+    ):
         assert tp_group is not None
         assert moe_dp_group is not None
         assert ep_group is not None
@@ -62,6 +68,7 @@ def setup_process_groups(self, tp_group: ProcessGroup, moe_dp_group: ProcessGrou
         self.ep_size = dist.get_world_size(ep_group)
         self.ep_rank = dist.get_rank(ep_group)
         self.ep_group = ep_group
+        self.fp8_communication = fp8_communication
 
         if self.num_experts % self.ep_size != 0:
             raise ValueError("The number of experts must be divisible by the number of expert parallel groups.")
@@ -80,9 +87,15 @@ def setup_process_groups(self, tp_group: ProcessGroup, moe_dp_group: ProcessGrou
         self.tp_group = tp_group
         if self.tp_group.size() > 1:
             for expert in held_experts:
-                expert.w1 = Linear1D_Col.from_native_module(expert.w1, self.tp_group)
-                expert.w3 = Linear1D_Col.from_native_module(expert.w3, self.tp_group)
-                expert.w2 = Linear1D_Row.from_native_module(expert.w2, self.tp_group)
+                expert.w1 = Linear1D_Col.from_native_module(
+                    expert.w1, self.tp_group, fp8_communication=self.fp8_communication
+                )
+                expert.w3 = Linear1D_Col.from_native_module(
+                    expert.w3, self.tp_group, fp8_communication=self.fp8_communication
+                )
+                expert.w2 = Linear1D_Row.from_native_module(
+                    expert.w2, self.tp_group, fp8_communication=self.fp8_communication
+                )
 
         for p in self.experts.parameters():
             set_moe_tensor_ep_group(p, ep_group)
@@ -99,7 +112,8 @@ def from_native_module(
         # TODO: better init
         LazyInitContext.materialize(module)
         module.__class__ = EPMixtralSparseMoeBlock
-        module.setup_process_groups(tp_group, moe_dp_group, ep_group)
+        fp8_communication = kwargs.get("fp8_communication", False)
+        module.setup_process_groups(tp_group, moe_dp_group, ep_group, fp8_communication)
         return module
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -120,6 +134,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         input_split_sizes = selected_experts.bincount(minlength=self.num_experts)
 
         output_split_sizes = torch.zeros_like(input_split_sizes)
+
         dist.all_to_all_single(output_split_sizes, input_split_sizes, group=self.ep_group)
 
         with torch.no_grad():
@@ -132,7 +147,13 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         input_split_list = input_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()
         output_split_list = output_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()
 
-        output_states, _ = all_to_all_uneven(dispatch_states, input_split_list, output_split_list, self.ep_group)
+        output_states, _ = all_to_all_uneven(
+            dispatch_states,
+            input_split_list,
+            output_split_list,
+            self.ep_group,
+            fp8_communication=self.fp8_communication,
+        )
         # compute expert output
         output_states = EPGradScalerIn.apply(output_states, self.ep_size)
         if output_states.size(0) > 0:
@@ -162,7 +183,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
                 output_states = torch.cat(output_states_list)
 
         output_states = EPGradScalerOut.apply(output_states, self.ep_size)
-        dispatch_states, _ = all_to_all_uneven(output_states, output_split_list, input_split_list, self.ep_group)
+        dispatch_states, _ = all_to_all_uneven(
+            output_states, output_split_list, input_split_list, self.ep_group, fp8_communication=self.fp8_communication
+        )
 
         recover_experts_idx = torch.empty_like(selected_experts_idx)
         recover_experts_idx[selected_experts_idx] = torch.arange(
@@ -566,9 +589,9 @@ def forward(
 
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
-            query_states = all_to_all_comm(query_states, sp_group)
-            key_states = all_to_all_comm(key_states, sp_group)
-            value_states = all_to_all_comm(value_states, sp_group)
+            query_states = all_to_all_comm(query_states, sp_group, fp8_communication=shard_config.fp8_communication)
+            key_states = all_to_all_comm(key_states, sp_group, fp8_communication=shard_config.fp8_communication)
+            value_states = all_to_all_comm(value_states, sp_group, fp8_communication=shard_config.fp8_communication)
             bsz, q_len, _ = query_states.size()
 
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
@@ -780,9 +803,13 @@ def forward(
             )
 
         if sp_mode in ["ring", "split_gather"]:
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group)
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds, 1, sp_group, fp8_communication=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group, 1 / sp_size)
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds, 1, sp_group, 1 / sp_size, fp8_communication=shard_config.fp8_communication
+            )
         hidden_states = inputs_embeds
 
         # decoder layers
@@ -831,9 +858,13 @@ def forward(
         hidden_states = self.norm(hidden_states)
 
         if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
+            )
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py
index 605f69c4a632..0b8a602d1164 100644
--- a/colossalai/shardformer/policies/deepseek.py
+++ b/colossalai/shardformer/policies/deepseek.py
@@ -118,18 +118,22 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
+                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=Linear1D_Col,
+                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=Linear1D_Col,
+                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
                         target_module=Linear1D_Row,
+                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
                     ),
                 ],
             )
@@ -138,7 +142,10 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 description=SubModuleReplacementDescription(
                     suffix="embed_tokens",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs={
+                        "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                        "fp8_communication": self.shard_config.fp8_communication,
+                    },
                 ),
                 policy=policy,
                 target_key="DeepseekModel",
@@ -155,6 +162,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                             "ep_group": self.shard_config.ep_group,
                             "tp_group": self.shard_config.tensor_parallel_process_group,
                             "moe_dp_group": self.shard_config.moe_dp_group,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     )
                 ],
@@ -305,7 +313,7 @@ def module_policy(self):
                         SubModuleReplacementDescription(
                             suffix="lm_head",
                             target_module=Linear1D_Col,
-                            kwargs=dict(gather_output=True),
+                            kwargs=dict(gather_output=True, fp8_communication=self.shard_config.fp8_communication),
                         )
                     ]
                 )
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index 10df143c99da..550b7f2360f5 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -114,21 +114,27 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
+                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=Linear1D_Col,
+                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=Linear1D_Col,
+                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
                         target_module=Linear1D_Row,
+                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
                     ),
                     SubModuleReplacementDescription(  # or replicate?
-                        suffix="block_sparse_moe.gate", target_module=Linear1D_Col, kwargs={"gather_output": True}
+                        suffix="block_sparse_moe.gate",
+                        target_module=Linear1D_Col,
+                        kwargs={"gather_output": True, "fp8_communication": self.shard_config.fp8_communication},
                     ),
                 ],
             )
@@ -138,7 +144,10 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 description=SubModuleReplacementDescription(
                     suffix="embed_tokens",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs={
+                        "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                        "fp8_communication": self.shard_config.fp8_communication,
+                    },
                 ),
                 policy=policy,
                 target_key=MixtralModel,
@@ -155,6 +164,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                             "ep_group": self.shard_config.ep_group,
                             "tp_group": self.shard_config.tensor_parallel_process_group,
                             "moe_dp_group": self.shard_config.moe_dp_group,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     )
                 ],
@@ -282,7 +292,7 @@ def module_policy(self):
                         SubModuleReplacementDescription(
                             suffix="lm_head",
                             target_module=Linear1D_Col,
-                            kwargs=dict(gather_output=True),
+                            kwargs=dict(gather_output=True, fp8_communication=self.shard_config.fp8_communication),
                         )
                     ]
                 )
@@ -336,7 +346,9 @@ def module_policy(self):
                 MixtralForSequenceClassification: ModulePolicyDescription(
                     sub_module_replacement=[
                         SubModuleReplacementDescription(
-                            suffix="score", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                            suffix="score",
+                            target_module=Linear1D_Col,
+                            kwargs=dict(gather_output=True, fp8_communication=self.shard_config.fp8_communication),
                         )
                     ]
                 )

From b2483c8e31c7291f1d083d7c9c59bd79203c3c3b Mon Sep 17 00:00:00 2001
From: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Date: Mon, 12 Aug 2024 18:17:05 +0800
Subject: [PATCH 026/167] [fp8] support hybrid parallel plugin (#5982)

* support fp8 comm for qwen2 model

* support fp8 comm for qwen2 model

* support fp8 comm for qwen2 model

* fp8

* fix

* bert and bloom

* chatglm and command

* gpt2,gptj,bert, falcon,blip2

* mistral,opy,sam,t5,vit,whisper

* fix

* fix

* fix
---
 colossalai/shardformer/layer/embedding.py     |  6 +-
 colossalai/shardformer/layer/linear.py        |  2 +
 .../shardformer/layer/qkv_fused_linear.py     |  6 +-
 colossalai/shardformer/modeling/bert.py       | 30 ++++++--
 colossalai/shardformer/modeling/bloom.py      | 20 ++++-
 colossalai/shardformer/modeling/chatglm2.py   | 37 +++++++++-
 colossalai/shardformer/modeling/command.py    | 30 ++++++--
 colossalai/shardformer/modeling/gpt2.py       |  2 +
 colossalai/shardformer/modeling/gptj.py       |  4 +
 colossalai/shardformer/modeling/llama.py      | 16 +++-
 colossalai/shardformer/modeling/qwen2.py      | 30 ++++++--
 colossalai/shardformer/policies/bert.py       | 22 +++++-
 colossalai/shardformer/policies/blip2.py      | 70 +++++++++++++++++-
 colossalai/shardformer/policies/bloom.py      | 40 ++++++++--
 colossalai/shardformer/policies/chatglm2.py   | 16 +++-
 colossalai/shardformer/policies/command.py    | 24 ++++--
 colossalai/shardformer/policies/falcon.py     |  9 ++-
 colossalai/shardformer/policies/gpt2.py       | 11 ++-
 colossalai/shardformer/policies/gptj.py       | 22 +++++-
 colossalai/shardformer/policies/llama.py      | 10 ++-
 colossalai/shardformer/policies/mistral.py    | 39 +++++++++-
 colossalai/shardformer/policies/opt.py        | 22 +++++-
 colossalai/shardformer/policies/qwen2.py      | 33 ++++++---
 colossalai/shardformer/policies/sam.py        | 64 ++++++++++++++++
 colossalai/shardformer/policies/t5.py         | 73 +++++++++++++++++--
 colossalai/shardformer/policies/vit.py        | 20 ++++-
 colossalai/shardformer/policies/whisper.py    | 58 ++++++++++++++-
 27 files changed, 633 insertions(+), 83 deletions(-)

diff --git a/colossalai/shardformer/layer/embedding.py b/colossalai/shardformer/layer/embedding.py
index 186063503fd4..18efb0ec5d2d 100644
--- a/colossalai/shardformer/layer/embedding.py
+++ b/colossalai/shardformer/layer/embedding.py
@@ -68,6 +68,7 @@ def __init__(
         gather_output: bool = True,
         weight: Optional[nn.Parameter] = None,
         weight_initializer: Callable = init.normal_(),
+        fp8_communication: bool = False,
         *args,
         **kwargs,
     ):
@@ -81,6 +82,7 @@ def __init__(
         self.embed_args = args
         self.embed_kwargs = kwargs
         self.gather_output = gather_output
+        self.fp8_communication = fp8_communication
 
         # offset the seed with randomizer index and rank
         seed = torch.random.initial_seed()
@@ -155,7 +157,9 @@ def _fill_padding_idx_with_zero(self) -> None:
     def forward(self, input_: Tensor) -> Tensor:
         output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs)
         if self.gather_output:
-            output = gather_forward_split_backward(output_parallel, dim=-1, process_group=self.process_group)
+            output = gather_forward_split_backward(
+                output_parallel, dim=-1, process_group=self.process_group, fp8_communication=self.fp8_communication
+            )
             return output
         else:
             return output_parallel
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index 38a6ef1a19ae..4e5ebef0dcb6 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -572,6 +572,7 @@ def __init__(
         weight: Optional[Parameter] = None,
         bias_: Optional[Parameter] = None,
         make_vocab_size_divisible_by: int = 64,
+        fp8_communication: bool = False,
         **kwargs,
     ):
         # create weight and bias
@@ -602,6 +603,7 @@ def __init__(
             **kwargs,
             new_num_embeddings=new_out_features,
             old_num_embeddings=out_features,
+            fp8_communication=fp8_communication,
         )
         # get the length of valid embeddings
         tp_rank = dist.get_rank(process_group)
diff --git a/colossalai/shardformer/layer/qkv_fused_linear.py b/colossalai/shardformer/layer/qkv_fused_linear.py
index 93a7eb231b0e..561867993402 100644
--- a/colossalai/shardformer/layer/qkv_fused_linear.py
+++ b/colossalai/shardformer/layer/qkv_fused_linear.py
@@ -627,6 +627,7 @@ def __init__(
         bias_: Optional[Parameter] = None,
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
+        fp8_communication: bool = False,
     ):
         super().__init__()
         # Keep input parameters
@@ -638,6 +639,7 @@ def __init__(
         self.n_fused = n_fused
         self.process_group = process_group
         self.async_communication = async_communication
+        self.fp8_communication = fp8_communication
 
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
@@ -767,7 +769,9 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
 
         if self.gather_output:
             # All-gather across the partitions.
-            output = gather_forward_split_backward(output_parallel, dim=-1, process_group=self.process_group)
+            output = gather_forward_split_backward(
+                output_parallel, dim=-1, process_group=self.process_group, fp8_communication=self.fp8_communication
+            )
         else:
             output = output_parallel
 
diff --git a/colossalai/shardformer/modeling/bert.py b/colossalai/shardformer/modeling/bert.py
index 7710b56e7cd9..580f3618c6dc 100644
--- a/colossalai/shardformer/modeling/bert.py
+++ b/colossalai/shardformer/modeling/bert.py
@@ -187,11 +187,17 @@ def bert_model_forward(
         if shard_config is not None and shard_config.enable_sequence_parallelism:
             if shard_config.sequence_parallelism_mode == "split_gather":
                 hidden_states = split_forward_gather_backward(
-                    hidden_states, dim=1, process_group=shard_config.tensor_parallel_process_group
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
                 if encoder_hidden_states is not None:
                     encoder_hidden_states = split_forward_gather_backward(
-                        encoder_hidden_states, dim=1, process_group=shard_config.tensor_parallel_process_group
+                        encoder_hidden_states,
+                        dim=1,
+                        process_group=shard_config.tensor_parallel_process_group,
+                        fp8_communication=shard_config.fp8_communication,
                     )
 
         for idx, encoder_layer in enumerate(self.encoder.layer[start_idx:end_idx], start=start_idx):
@@ -242,7 +248,10 @@ def custom_forward(*inputs):
         if shard_config is not None and shard_config.enable_sequence_parallelism:
             if shard_config.sequence_parallelism_mode == "split_gather":
                 hidden_states = gather_forward_split_backward(
-                    hidden_states, dim=1, process_group=shard_config.tensor_parallel_process_group
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
 
         if output_hidden_states:
@@ -1135,11 +1144,17 @@ def forward(
         # split the input tensor along sequence dimension
         # [batch_size, seq_len, hidden_size] -> [batch_size, seq_len/TP_size, hidden_size]
         embedding_output = split_forward_gather_backward(
-            embedding_output, dim=1, process_group=shard_config.tensor_parallel_process_group
+            embedding_output,
+            dim=1,
+            process_group=shard_config.tensor_parallel_process_group,
+            fp8_communication=shard_config.fp8_communication,
         )
         if encoder_hidden_states is not None:
             encoder_hidden_states = split_forward_gather_backward(
-                encoder_hidden_states, dim=1, process_group=shard_config.tensor_parallel_process_group
+                encoder_hidden_states,
+                dim=1,
+                process_group=shard_config.tensor_parallel_process_group,
+                fp8_communication=shard_config.fp8_communication,
             )
 
         encoder_outputs = self.encoder(
@@ -1159,7 +1174,10 @@ def forward(
 
         # When sequence parallelism done, gather the output tensor in forward and split it in backward
         sequence_output = gather_forward_split_backward(
-            sequence_output, dim=1, process_group=shard_config.tensor_parallel_process_group
+            sequence_output,
+            dim=1,
+            process_group=shard_config.tensor_parallel_process_group,
+            fp8_communication=shard_config.fp8_communication,
         )
 
         pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
diff --git a/colossalai/shardformer/modeling/bloom.py b/colossalai/shardformer/modeling/bloom.py
index 26ffef6c5ee0..f8fd4665f1bb 100644
--- a/colossalai/shardformer/modeling/bloom.py
+++ b/colossalai/shardformer/modeling/bloom.py
@@ -221,7 +221,10 @@ def bloom_model_forward(
         if shard_config and shard_config.enable_sequence_parallelism:
             if shard_config.sequence_parallelism_mode == "split_gather":
                 hidden_states = split_forward_gather_backward(
-                    hidden_states, dim=1, process_group=shard_config.tensor_parallel_process_group
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
 
         start_idx, end_idx = stage_index[0], stage_index[1]
@@ -264,7 +267,10 @@ def bloom_model_forward(
         if shard_config and shard_config.enable_sequence_parallelism:
             if shard_config.sequence_parallelism_mode == "split_gather":
                 hidden_states = gather_forward_split_backward(
-                    hidden_states, dim=1, process_group=shard_config.tensor_parallel_process_group
+                    hidden_states,
+                    dim=1,
+                    process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
 
         if stage_manager.is_last_stage():
@@ -922,7 +928,10 @@ def forward(
         # split the input tensor along sequence dimension
         # [batch_size, seq_len, hidden_size] -> [batch_size, seq_len/TP_size, hidden_size]
         hidden_states = split_forward_gather_backward(
-            hidden_states, dim=1, process_group=shard_config.tensor_parallel_process_group
+            hidden_states,
+            dim=1,
+            process_group=shard_config.tensor_parallel_process_group,
+            fp8_communication=shard_config.fp8_communication,
         )
 
         for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
@@ -960,7 +969,10 @@ def forward(
 
         # When sequence parallelism done, gather the output tensor in forward and split it in backward
         hidden_states = gather_forward_split_backward(
-            hidden_states, dim=1, process_group=shard_config.tensor_parallel_process_group
+            hidden_states,
+            dim=1,
+            process_group=shard_config.tensor_parallel_process_group,
+            fp8_communication=shard_config.fp8_communication,
         )
         # Add last hidden state
         hidden_states = self.ln_f(hidden_states)
diff --git a/colossalai/shardformer/modeling/chatglm2.py b/colossalai/shardformer/modeling/chatglm2.py
index 34d900d8de94..5fd7b6461f7e 100644
--- a/colossalai/shardformer/modeling/chatglm2.py
+++ b/colossalai/shardformer/modeling/chatglm2.py
@@ -206,6 +206,7 @@ def chatglm_model_forward(
                     hidden_states,
                     dim=0,
                     process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
             elif shard_config.sequence_parallelism_mode == "all_to_all":
                 hidden_states = split_forward_gather_backward(
@@ -213,6 +214,7 @@ def chatglm_model_forward(
                     dim=0,
                     process_group=shard_config.sequence_parallel_process_group,
                     grad_scale=1 / shard_config.sequence_parallel_size,
+                    fp8_communication=shard_config.fp8_communication,
                 )
         for idx in range(start_idx, end_idx):
             layer = self.encoder._get_layer(idx)
@@ -245,6 +247,7 @@ def chatglm_model_forward(
                     hidden_states,
                     dim=0,
                     process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
             elif shard_config.sequence_parallelism_mode == "all_to_all":
                 hidden_states = gather_forward_split_backward(
@@ -252,6 +255,7 @@ def chatglm_model_forward(
                     dim=0,
                     process_group=shard_config.sequence_parallel_process_group,
                     grad_scale=shard_config.sequence_parallel_size,
+                    fp8_communication=shard_config.fp8_communication,
                 )
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
@@ -414,6 +418,7 @@ def forward(
                 inputs_embeds,
                 dim=0,
                 process_group=sp_group,
+                fp8_communication=shard_config.fp8_communication,
             )
         elif sp_mode == "all_to_all":
             inputs_embeds = split_forward_gather_backward(
@@ -421,6 +426,7 @@ def forward(
                 dim=0,
                 process_group=sp_group,
                 grad_scale=1 / sp_size,
+                fp8_communication=shard_config.fp8_communication,
             )
         hidden_states, presents, all_hidden_states, all_self_attentions = self.encoder(
             inputs_embeds,
@@ -436,6 +442,7 @@ def forward(
                 hidden_states,
                 dim=0,
                 process_group=shard_config.tensor_parallel_process_group,
+                fp8_communication=shard_config.fp8_communication,
             )
         elif sp_mode == "all_to_all":
             hidden_states = gather_forward_split_backward(
@@ -443,6 +450,7 @@ def forward(
                 dim=0,
                 process_group=sp_group,
                 grad_scale=sp_size,
+                fp8_communication=shard_config.fp8_communication,
             )
 
         if not return_dict:
@@ -532,9 +540,24 @@ def forward(
             key_layer = key_layer.reshape(sq, bs, -1)
             value_layer = value_layer.reshape(sq, bs, -1)
 
-            query_layer = all_to_all_comm(query_layer, sp_group, gather_dim=0)
-            key_layer = all_to_all_comm(key_layer, sp_group, gather_dim=0)
-            value_layer = all_to_all_comm(value_layer, sp_group, gather_dim=0)
+            query_layer = all_to_all_comm(
+                query_layer,
+                sp_group,
+                gather_dim=0,
+                fp8_communication=shard_config.fp8_communication,
+            )
+            key_layer = all_to_all_comm(
+                key_layer,
+                sp_group,
+                gather_dim=0,
+                fp8_communication=shard_config.fp8_communication,
+            )
+            value_layer = all_to_all_comm(
+                value_layer,
+                sp_group,
+                gather_dim=0,
+                fp8_communication=shard_config.fp8_communication,
+            )
 
             query_layer = query_layer.view(
                 sq * sp_size,
@@ -610,7 +633,13 @@ def forward(
 
         context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
         if sp_mode == "all_to_all":
-            context_layer = all_to_all_comm(context_layer, sp_group, gather_dim=2, scatter_dim=0)
+            context_layer = all_to_all_comm(
+                context_layer,
+                sp_group,
+                gather_dim=2,
+                scatter_dim=0,
+                fp8_communication=shard_config.fp8_communication,
+            )
 
         # =================
         # Output. [sq, b, h]
diff --git a/colossalai/shardformer/modeling/command.py b/colossalai/shardformer/modeling/command.py
index 5b36fc7db3b9..1ece0c118d0c 100644
--- a/colossalai/shardformer/modeling/command.py
+++ b/colossalai/shardformer/modeling/command.py
@@ -140,6 +140,7 @@ def command_model_forward(
                     hidden_states,
                     dim=1,
                     process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
             elif shard_config.sequence_parallelism_mode == "all_to_all":
                 hidden_states = split_forward_gather_backward(
@@ -147,6 +148,7 @@ def command_model_forward(
                     dim=1,
                     process_group=shard_config.sequence_parallel_process_group,
                     grad_scale=1 / shard_config.sequence_parallel_size,
+                    fp8_communication=shard_config.fp8_communication,
                 )
 
         # decoder layers
@@ -211,6 +213,7 @@ def command_model_forward(
                     hidden_states,
                     dim=1,
                     process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
             elif shard_config.sequence_parallelism_mode == "all_to_all":
                 hidden_states = gather_forward_split_backward(
@@ -218,6 +221,7 @@ def command_model_forward(
                     dim=1,
                     process_group=shard_config.sequence_parallel_process_group,
                     grad_scale=shard_config.sequence_parallel_size,
+                    fp8_communication=shard_config.fp8_communication,
                 )
 
         # add hidden states from the last decoder layer
@@ -382,9 +386,9 @@ def forward(
 
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
-            query_states = all_to_all_comm(query_states, sp_group)
-            key_states = all_to_all_comm(key_states, sp_group)
-            value_states = all_to_all_comm(value_states, sp_group)
+            query_states = all_to_all_comm(query_states, sp_group, fp8_communication=shard_config.fp8_communication)
+            key_states = all_to_all_comm(key_states, sp_group, fp8_communication=shard_config.fp8_communication)
+            value_states = all_to_all_comm(value_states, sp_group, fp8_communication=shard_config.fp8_communication)
             bsz, q_len, _ = query_states.size()
 
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
@@ -446,7 +450,9 @@ def forward(
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
             attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
-            attn_output = all_to_all_comm(attn_output, sp_group, scatter_dim=1, gather_dim=2)
+            attn_output = all_to_all_comm(
+                attn_output, sp_group, scatter_dim=1, gather_dim=2, fp8_communication=shard_config.fp8_communication
+            )
         else:
             attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
@@ -526,9 +532,13 @@ def forward(
             attention_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
 
         if sp_mode in ["ring", "split_gather"]:
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group)
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds, 1, sp_group, fp8_communication=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            inputs_embeds = split_forward_gather_backward(inputs_embeds, 1, sp_group, 1 / sp_size)
+            inputs_embeds = split_forward_gather_backward(
+                inputs_embeds, 1, sp_group, 1 / sp_size, fp8_communication=shard_config.fp8_communication
+            )
         hidden_states = inputs_embeds
 
         # decoder layers
@@ -573,9 +583,13 @@ def forward(
         hidden_states = self.norm(hidden_states)
 
         if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
+            )
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 0dbf0ca5af36..97544e1105d6 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -221,6 +221,7 @@ def gpt2_model_forward(
                     hidden_states,
                     dim=1,
                     process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
 
         # Going through held blocks.
@@ -276,6 +277,7 @@ def gpt2_model_forward(
                     hidden_states,
                     dim=1,
                     process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
 
         if stage_manager.is_last_stage():
diff --git a/colossalai/shardformer/modeling/gptj.py b/colossalai/shardformer/modeling/gptj.py
index facd2fcafbae..51b228712bf5 100644
--- a/colossalai/shardformer/modeling/gptj.py
+++ b/colossalai/shardformer/modeling/gptj.py
@@ -185,6 +185,7 @@ def gptj_model_forward(
                 hidden_states,
                 dim=1,
                 process_group=shard_config.tensor_parallel_process_group,
+                fp8_communication=shard_config.fp8_communication,
             )
 
         # Going through held blocks.
@@ -236,6 +237,7 @@ def gptj_model_forward(
                 hidden_states,
                 dim=1,
                 process_group=shard_config.tensor_parallel_process_group,
+                fp8_communication=shard_config.fp8_communication,
             )
 
         if stage_manager.is_last_stage():
@@ -915,6 +917,7 @@ def forward(
             hidden_states,
             dim=1,
             process_group=shard_config.tensor_parallel_process_group,
+            fp8_communication=shard_config.fp8_communication,
         )
 
         for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
@@ -978,6 +981,7 @@ def custom_forward(*inputs):
             hidden_states,
             dim=1,
             process_group=shard_config.tensor_parallel_process_group,
+            fp8_communication=shard_config.fp8_communication,
         )
 
         hidden_states = self.ln_f(hidden_states)
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 693f6584f3a7..99b31745bb05 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -143,9 +143,13 @@ def llama_model_forward(
         # Support SP + PP
         if stage_manager.is_first_stage():
             if sp_mode in ["ring", "split_gather"]:
-                hidden_states = split_forward_gather_backward(hidden_states, 1, sp_group)
+                hidden_states = split_forward_gather_backward(
+                    hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
+                )
             elif sp_mode == "all_to_all":
-                hidden_states = split_forward_gather_backward(hidden_states, 1, sp_group, 1 / sp_size)
+                hidden_states = split_forward_gather_backward(
+                    hidden_states, 1, sp_group, 1 / sp_size, fp8_communication=shard_config.fp8_communication
+                )
 
         if self.gradient_checkpointing and self.training and use_cache:
             if use_cache:
@@ -210,9 +214,13 @@ def llama_model_forward(
         if stage_manager.is_last_stage():
             hidden_states = self.norm(hidden_states)
             if sp_mode == "ring" or sp_mode == "split_gather":
-                hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+                hidden_states = gather_forward_split_backward(
+                    hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
+                )
             elif sp_mode == "all_to_all":
-                hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
+                hidden_states = gather_forward_split_backward(
+                    hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
+                )
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
diff --git a/colossalai/shardformer/modeling/qwen2.py b/colossalai/shardformer/modeling/qwen2.py
index 538e96c32c6d..a61360d17570 100644
--- a/colossalai/shardformer/modeling/qwen2.py
+++ b/colossalai/shardformer/modeling/qwen2.py
@@ -175,6 +175,7 @@ def qwen2_model_forward(
                     hidden_states,
                     dim=1,
                     process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
             elif shard_config.sequence_parallelism_mode == "all_to_all":
                 hidden_states = split_forward_gather_backward(
@@ -182,6 +183,7 @@ def qwen2_model_forward(
                     dim=1,
                     process_group=shard_config.sequence_parallel_process_group,
                     grad_scale=1 / shard_config.sequence_parallel_size,
+                    fp8_communication=shard_config.fp8_communication,
                 )
 
         # decoder layers
@@ -246,6 +248,7 @@ def qwen2_model_forward(
                     hidden_states,
                     dim=1,
                     process_group=shard_config.tensor_parallel_process_group,
+                    fp8_communication=shard_config.fp8_communication,
                 )
             elif shard_config.sequence_parallelism_mode == "all_to_all":
                 hidden_states = gather_forward_split_backward(
@@ -253,6 +256,7 @@ def qwen2_model_forward(
                     dim=1,
                     process_group=shard_config.sequence_parallel_process_group,
                     grad_scale=shard_config.sequence_parallel_size,
+                    fp8_communication=shard_config.fp8_communication,
                 )
         # add hidden states from the last decoder layer
         if output_hidden_states:
@@ -516,9 +520,9 @@ def forward(
         value_states = self.v_proj(hidden_states)
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
-            query_states = all_to_all_comm(query_states, sp_group)
-            key_states = all_to_all_comm(key_states, sp_group)
-            value_states = all_to_all_comm(value_states, sp_group)
+            query_states = all_to_all_comm(query_states, sp_group, fp8_communication=shard_config.fp8_communication)
+            key_states = all_to_all_comm(key_states, sp_group, fp8_communication=shard_config.fp8_communication)
+            value_states = all_to_all_comm(value_states, sp_group, fp8_communication=shard_config.fp8_communication)
             bsz, q_len, _ = query_states.size()
 
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
@@ -604,7 +608,9 @@ def forward(
         attn_output = attn_output.transpose(1, 2).contiguous()
         if sp_mode == "all_to_all":
             attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
-            attn_output = all_to_all_comm(attn_output, sp_group, scatter_dim=1, gather_dim=2)
+            attn_output = all_to_all_comm(
+                attn_output, sp_group, scatter_dim=1, gather_dim=2, fp8_communication=shard_config.fp8_communication
+            )
         else:
             attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
@@ -702,9 +708,13 @@ def forward(
         next_decoder_cache = None
 
         if sp_mode in ["ring", "split_gather"]:
-            hidden_states = split_forward_gather_backward(hidden_states, 1, sp_group)
+            hidden_states = split_forward_gather_backward(
+                hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            hidden_states = split_forward_gather_backward(hidden_states, 1, sp_group, 1 / sp_size)
+            hidden_states = split_forward_gather_backward(
+                hidden_states, 1, sp_group, 1 / sp_size, fp8_communication=shard_config.fp8_communication
+            )
 
         for decoder_layer in self.layers:
             if output_hidden_states:
@@ -741,9 +751,13 @@ def forward(
         hidden_states = self.norm(hidden_states)
 
         if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
+            )
         elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=sp_size)
+            hidden_states = gather_forward_split_backward(
+                hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
+            )
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
diff --git a/colossalai/shardformer/policies/bert.py b/colossalai/shardformer/policies/bert.py
index b84a372a5d5f..4c33e14bc2ab 100644
--- a/colossalai/shardformer/policies/bert.py
+++ b/colossalai/shardformer/policies/bert.py
@@ -98,6 +98,7 @@ def module_policy(self):
                         kwargs={
                             "seq_parallel_mode": sp_mode,
                             "overlap": overlap,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
@@ -106,6 +107,7 @@ def module_policy(self):
                         kwargs={
                             "seq_parallel_mode": sp_mode,
                             "overlap": overlap,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
@@ -114,6 +116,7 @@ def module_policy(self):
                         kwargs={
                             "seq_parallel_mode": sp_mode,
                             "overlap": overlap,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
@@ -123,7 +126,10 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="attention.output.dense",
                         target_module=col_nn.Linear1D_Row,
-                        kwargs={"seq_parallel_mode": sp_mode},
+                        kwargs={
+                            "seq_parallel_mode": sp_mode,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attention.output.dropout",
@@ -136,12 +142,16 @@ def module_policy(self):
                             "seq_parallel_mode": sp_mode,
                             "overlap": overlap,
                             "skip_bias_add": self.enable_bias_gelu_fused,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
                         suffix="output.dense",
                         target_module=col_nn.Linear1D_Row,
-                        kwargs={"seq_parallel_mode": sp_mode},
+                        kwargs={
+                            "seq_parallel_mode": sp_mode,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="output.dropout",
@@ -180,6 +190,13 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="word_embeddings",
                         target_module=embedding_cls,
+                        kwargs=(
+                            {
+                                "fp8_communication": self.shard_config.fp8_communication,
+                            }
+                            if self.shard_config.enable_tensor_parallelism
+                            else {}
+                        ),
                     )
                 ],
                 policy=policy,
@@ -249,6 +266,7 @@ def add_lm_head_policy(self, base_policy):
                     kwargs={
                         "gather_output": True,
                         "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                        "fp8_communication": self.shard_config.fp8_communication,
                     },
                 ),
                 policy=base_policy,
diff --git a/colossalai/shardformer/policies/blip2.py b/colossalai/shardformer/policies/blip2.py
index 32d4edadb3e4..da798f6a0521 100644
--- a/colossalai/shardformer/policies/blip2.py
+++ b/colossalai/shardformer/policies/blip2.py
@@ -72,20 +72,30 @@ def module_policy(self):
                         target_module=col_nn.FusedLinear1D_Col,
                         kwargs={
                             "n_fused": 3,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.projection",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.fc1",
                         target_module=col_nn.Linear1D_Col,
-                        kwargs={"skip_bias_add": self.enable_bias_gelu_fused},
+                        kwargs={
+                            "skip_bias_add": self.enable_bias_gelu_fused,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.fc2",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                 ],
             )
@@ -114,14 +124,23 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="attention.attention.query",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attention.attention.key",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attention.attention.value",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attention.attention.dropout",
@@ -130,6 +149,9 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="attention.output.dense",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attention.output.dropout",
@@ -138,14 +160,23 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="crossattention.attention.query",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="crossattention.attention.key",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="crossattention.attention.value",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="crossattention.attention.dropout",
@@ -154,6 +185,9 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="crossattention.output.dense",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="crossattention.output.dropout",
@@ -162,10 +196,16 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="intermediate_query.dense",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="output_query.dense",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="output_query.dropout",
@@ -185,26 +225,44 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.out_proj",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="fc1",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="fc2",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                 ],
             )
@@ -225,7 +283,14 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="model.decoder.embed_tokens",
                         target_module=embedding_cls,
-                        kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                        kwargs=(
+                            {
+                                "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                                "fp8_communication": self.shard_config.fp8_communication,
+                            }
+                            if self.shard_config.enable_tensor_parallelism
+                            else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                        ),
                     ),
                 ],
                 policy=policy,
@@ -241,6 +306,7 @@ def module_policy(self):
                         kwargs={
                             "gather_output": True,
                             "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                 ],
diff --git a/colossalai/shardformer/policies/bloom.py b/colossalai/shardformer/policies/bloom.py
index d80adb84a756..a43ac02d0cd7 100644
--- a/colossalai/shardformer/policies/bloom.py
+++ b/colossalai/shardformer/policies/bloom.py
@@ -76,12 +76,19 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="self_attention.query_key_value",
                         target_module=col_nn.Linear1D_Col,
-                        kwargs={"seq_parallel_mode": sp_mode, "overlap": overlap},
+                        kwargs={
+                            "seq_parallel_mode": sp_mode,
+                            "overlap": overlap,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attention.dense",
                         target_module=col_nn.Linear1D_Row,
-                        kwargs={"seq_parallel_mode": sp_mode},
+                        kwargs={
+                            "seq_parallel_mode": sp_mode,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attention.attention_dropout",
@@ -90,12 +97,19 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="mlp.dense_h_to_4h",
                         target_module=col_nn.Linear1D_Col,
-                        kwargs={"seq_parallel_mode": sp_mode, "overlap": overlap},
+                        kwargs={
+                            "seq_parallel_mode": sp_mode,
+                            "overlap": overlap,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.dense_4h_to_h",
                         target_module=col_nn.Linear1D_Row,
-                        kwargs={"seq_parallel_mode": sp_mode},
+                        kwargs={
+                            "seq_parallel_mode": sp_mode,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                 ],
             )
@@ -115,7 +129,14 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="word_embeddings",
                         target_module=embedding_cls,
-                        kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                        kwargs=(
+                            {
+                                "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                                "fp8_communication": self.shard_config.fp8_communication,
+                            }
+                            if self.shard_config.enable_tensor_parallelism
+                            else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                        ),
                     ),
                 ],
                 policy=policy,
@@ -279,6 +300,7 @@ def module_policy(self):
                     kwargs=dict(
                         gather_output=not self.shard_config.parallel_output,
                         make_vocab_size_divisible_by=self.shard_config.make_vocab_size_divisible_by,
+                        fp8_communication=self.shard_config.fp8_communication,
                     ),
                 ),
                 policy=policy,
@@ -337,7 +359,9 @@ def module_policy(self):
         if self.shard_config.enable_tensor_parallelism:
             self.append_or_create_submodule_replacement(
                 description=SubModuleReplacementDescription(
-                    suffix="score", target_module=col_nn.Linear1D_Col, kwargs=dict(gather_output=True)
+                    suffix="score",
+                    target_module=col_nn.Linear1D_Col,
+                    kwargs=dict(gather_output=True, fp8_communication=self.shard_config.fp8_communication),
                 ),
                 policy=policy,
                 target_key=BloomForSequenceClassification,
@@ -374,7 +398,9 @@ def module_policy(self):
             self.append_or_create_submodule_replacement(
                 description=[
                     SubModuleReplacementDescription(
-                        suffix="classifier", target_module=col_nn.Linear1D_Col, kwargs=dict(gather_output=True)
+                        suffix="classifier",
+                        target_module=col_nn.Linear1D_Col,
+                        kwargs=dict(gather_output=True, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="dropout",
diff --git a/colossalai/shardformer/policies/chatglm2.py b/colossalai/shardformer/policies/chatglm2.py
index 3877bdac3ae2..16c13085ade1 100644
--- a/colossalai/shardformer/policies/chatglm2.py
+++ b/colossalai/shardformer/policies/chatglm2.py
@@ -128,12 +128,17 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                             "seq_parallel_mode": sp_mode,
                             "seq_parallel_dim": 0,
                             "overlap": overlap,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attention.dense",
                         target_module=col_nn.Linear1D_Row,
-                        kwargs={"seq_parallel_mode": sp_mode, "seq_parallel_dim": 0},
+                        kwargs={
+                            "seq_parallel_mode": sp_mode,
+                            "seq_parallel_dim": 0,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attention.core_attention.attention_dropout",
@@ -148,7 +153,14 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="embedding.word_embeddings",
                         target_module=embedding_cls,
-                        kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                        kwargs=(
+                            {
+                                "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                                "fp8_communication": self.shard_config.fp8_communication,
+                            }
+                            if self.shard_config.enable_tensor_parallelism
+                            else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                        ),
                     ),
                 ],
                 policy=policy,
diff --git a/colossalai/shardformer/policies/command.py b/colossalai/shardformer/policies/command.py
index a9b915d10485..d0903b11c9f3 100644
--- a/colossalai/shardformer/policies/command.py
+++ b/colossalai/shardformer/policies/command.py
@@ -126,37 +126,37 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.gate_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.up_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.down_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                 ],
             )
@@ -166,7 +166,14 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 description=SubModuleReplacementDescription(
                     suffix="embed_tokens",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=CohereModel,
@@ -303,6 +310,7 @@ def module_policy(self):
                             kwargs={
                                 "gather_output": not self.shard_config.parallel_output,
                                 "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                                "fp8_communication": self.shard_config.fp8_communication,
                             },
                         )
                     ],
diff --git a/colossalai/shardformer/policies/falcon.py b/colossalai/shardformer/policies/falcon.py
index e5c16733752e..e20fb1568505 100644
--- a/colossalai/shardformer/policies/falcon.py
+++ b/colossalai/shardformer/policies/falcon.py
@@ -105,7 +105,14 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="word_embeddings",
                         target_module=embedding_cls,
-                        kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                        kwargs=(
+                            {
+                                "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                                "fp8_communication": self.shard_config.fp8_communication,
+                            }
+                            if self.shard_config.enable_tensor_parallelism
+                            else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                        ),
                     ),
                 ],
                 policy=policy,
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index bb6269737da7..0a1949d85c74 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -162,7 +162,14 @@ def module_policy(self):
                 description=SubModuleReplacementDescription(
                     suffix="wte",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=GPT2Model,
@@ -332,6 +339,7 @@ def module_policy(self):
                             kwargs={
                                 "gather_output": False,
                                 "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                                "fp8_communication": self.shard_config.fp8_communication,
                             },
                         )
                     ],
@@ -402,6 +410,7 @@ def module_policy(self):
                             kwargs={
                                 "gather_output": True,
                                 "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                                "fp8_communication": self.shard_config.fp8_communication,
                             },
                         )
                     ]
diff --git a/colossalai/shardformer/policies/gptj.py b/colossalai/shardformer/policies/gptj.py
index c394d911e289..6f0c8803c3f1 100644
--- a/colossalai/shardformer/policies/gptj.py
+++ b/colossalai/shardformer/policies/gptj.py
@@ -77,6 +77,7 @@ def module_policy(self):
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
                             "overlap": overlap,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
@@ -84,6 +85,7 @@ def module_policy(self):
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
                             "overlap": overlap,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
@@ -91,19 +93,29 @@ def module_policy(self):
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
                             "overlap": overlap,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attn.out_proj",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.fc_in",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.fc_out",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attn.attn_dropout",
@@ -125,7 +137,14 @@ def module_policy(self):
                 description=SubModuleReplacementDescription(
                     suffix="wte",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=GPTJModel,
@@ -264,6 +283,7 @@ def module_policy(self):
                             kwargs={
                                 "gather_output": True,
                                 "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                                "fp8_communication": self.shard_config.fp8_communication,
                             },
                         )
                     ]
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 6f8404219364..23d2d39134d3 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -166,7 +166,14 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 description=SubModuleReplacementDescription(
                     suffix="embed_tokens",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=LlamaModel,
@@ -308,6 +315,7 @@ def module_policy(self):
                             kwargs={
                                 "gather_output": not self.shard_config.parallel_output,
                                 "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                                "fp8_communication": self.shard_config.fp8_communication,
                             },
                         )
                     ],
diff --git a/colossalai/shardformer/policies/mistral.py b/colossalai/shardformer/policies/mistral.py
index c5a0277a5783..72a5158e5269 100644
--- a/colossalai/shardformer/policies/mistral.py
+++ b/colossalai/shardformer/policies/mistral.py
@@ -88,30 +88,51 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
                         target_module=Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.gate_proj",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.up_proj",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.down_proj",
                         target_module=Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                 ],
             )
@@ -121,7 +142,14 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 description=SubModuleReplacementDescription(
                     suffix="embed_tokens",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=MistralModel,
@@ -281,6 +309,7 @@ def module_policy(self):
                             kwargs={
                                 "gather_output": not self.shard_config.parallel_output,
                                 "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                                "fp8_communication": self.shard_config.fp8_communication,
                             },
                         )
                     ]
@@ -297,7 +326,9 @@ def module_policy(self):
                         SubModuleReplacementDescription(
                             suffix="lm_head",
                             target_module=PaddingLMHead,
-                            kwargs=dict(make_vocab_size_divisible_by=self.shard_config.make_vocab_size_divisible_by),
+                            kwargs=dict(
+                                make_vocab_size_divisible_by=self.shard_config.make_vocab_size_divisible_by,
+                            ),
                         )
                     ]
                 )
@@ -350,7 +381,9 @@ def module_policy(self):
                 MistralForSequenceClassification: ModulePolicyDescription(
                     sub_module_replacement=[
                         SubModuleReplacementDescription(
-                            suffix="score", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                            suffix="score",
+                            target_module=Linear1D_Col,
+                            kwargs=dict(gather_output=True, fp8_communication=self.shard_config.fp8_communication),
                         )
                     ]
                 )
diff --git a/colossalai/shardformer/policies/opt.py b/colossalai/shardformer/policies/opt.py
index 524d2b8cd0c3..dd64ce652f86 100644
--- a/colossalai/shardformer/policies/opt.py
+++ b/colossalai/shardformer/policies/opt.py
@@ -102,18 +102,30 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="q_proj",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="k_proj",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="v_proj",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="out_proj",
                         target_module=Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                 ],
             )
@@ -123,7 +135,14 @@ def module_policy(self):
                 description=SubModuleReplacementDescription(
                     suffix="embed_tokens",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=OPTDecoder,
@@ -272,6 +291,7 @@ def module_policy(self):
                     kwargs=dict(
                         gather_output=not self.shard_config.parallel_output,
                         make_vocab_size_divisible_by=self.shard_config.make_vocab_size_divisible_by,
+                        fp8_communication=self.shard_config.fp8_communication,
                     ),
                 ),
                 policy=policy,
diff --git a/colossalai/shardformer/policies/qwen2.py b/colossalai/shardformer/policies/qwen2.py
index 362c14060fd9..1b066200de64 100644
--- a/colossalai/shardformer/policies/qwen2.py
+++ b/colossalai/shardformer/policies/qwen2.py
@@ -119,37 +119,37 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.gate_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.up_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.down_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode),
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                 ],
             )
@@ -159,7 +159,14 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 description=SubModuleReplacementDescription(
                     suffix="embed_tokens",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=Qwen2Model,
@@ -317,7 +324,11 @@ def module_policy(self):
             new_item = {
                 Qwen2ForCausalLM: ModulePolicyDescription(
                     sub_module_replacement=[
-                        SubModuleReplacementDescription(suffix="lm_head", target_module=Linear1D_Col)
+                        SubModuleReplacementDescription(
+                            suffix="lm_head",
+                            target_module=Linear1D_Col,
+                            kwargs=dict(fp8_communication=self.shard_config.fp8_communication),
+                        )
                     ],
                     method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)},
                 )
@@ -366,7 +377,9 @@ def module_policy(self):
                 Qwen2ForSequenceClassification: ModulePolicyDescription(
                     sub_module_replacement=[
                         SubModuleReplacementDescription(
-                            suffix="score", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                            suffix="score",
+                            target_module=Linear1D_Col,
+                            kwargs=dict(gather_output=True, fp8_communication=self.shard_config.fp8_communication),
                         )
                     ]
                 )
diff --git a/colossalai/shardformer/policies/sam.py b/colossalai/shardformer/policies/sam.py
index 53faf8997f02..674fe5e58799 100644
--- a/colossalai/shardformer/policies/sam.py
+++ b/colossalai/shardformer/policies/sam.py
@@ -43,19 +43,29 @@ def module_policy(self):
                         target_module=col_nn.FusedLinear1D_Col,
                         kwargs={
                             "n_fused": 3,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attn.proj",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.lin1",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.lin2",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                 ],
             )
@@ -68,58 +78,100 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.out_proj",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="cross_attn_token_to_image.q_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="cross_attn_token_to_image.k_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="cross_attn_token_to_image.v_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="cross_attn_token_to_image.out_proj",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.lin1",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.lin2",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="cross_attn_image_to_token.q_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="cross_attn_image_to_token.k_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="cross_attn_image_to_token.v_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="cross_attn_image_to_token.out_proj",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                 ],
             )
@@ -132,18 +184,30 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="final_attn_token_to_image.q_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="final_attn_token_to_image.k_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="final_attn_token_to_image.v_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="final_attn_token_to_image.out_proj",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                 ],
             )
diff --git a/colossalai/shardformer/policies/t5.py b/colossalai/shardformer/policies/t5.py
index 0b594678c71b..84b5d95947f0 100644
--- a/colossalai/shardformer/policies/t5.py
+++ b/colossalai/shardformer/policies/t5.py
@@ -117,23 +117,38 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="q",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="k",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="v",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="o",
                         target_module=Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="relative_attention_bias",
                         target_module=Embedding1D,
-                        kwargs=dict(gather_output=False),
+                        kwargs=dict(
+                            gather_output=False,
+                            fp8_communication=self.shard_config.fp8_communication,
+                        ),
                         ignore_if_not_exist=True,
                     ),
                 ],
@@ -151,13 +166,24 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="wi_0 ",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="wi_1",
                         target_module=Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
-                        suffix="wo", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                        suffix="wo",
+                        target_module=Linear1D_Col,
+                        kwargs=dict(
+                            gather_output=True,
+                            fp8_communication=self.shard_config.fp8_communication,
+                        ),
                     ),
                     SubModuleReplacementDescription(
                         suffix="dropout",
@@ -170,10 +196,16 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="wi",
                         target_module=Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="wo",
                         target_module=Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="dropout",
@@ -187,7 +219,14 @@ def module_policy(self):
                 description=SubModuleReplacementDescription(
                     suffix="embed_tokens",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=T5Stack,
@@ -407,7 +446,14 @@ def module_policy(self):
                 description=SubModuleReplacementDescription(
                     suffix="shared",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=T5Model,
@@ -451,7 +497,14 @@ def module_policy(self):
                 description=SubModuleReplacementDescription(
                     suffix="shared",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=T5ForConditionalGeneration,
@@ -465,6 +518,7 @@ def module_policy(self):
                     kwargs={
                         "gather_output": True,
                         "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                        "fp8_communication": self.shard_config.fp8_communication,
                     },
                 ),
                 policy=policy,
@@ -539,7 +593,14 @@ def module_policy(self):
                 description=SubModuleReplacementDescription(
                     suffix="shared",
                     target_module=embedding_cls,
-                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=T5EncoderModel,
diff --git a/colossalai/shardformer/policies/vit.py b/colossalai/shardformer/policies/vit.py
index 069ad0c2690c..07202094f1f3 100644
--- a/colossalai/shardformer/policies/vit.py
+++ b/colossalai/shardformer/policies/vit.py
@@ -70,14 +70,23 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="attention.attention.query",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attention.attention.key",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attention.attention.value",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attention.attention.dropout",
@@ -86,6 +95,9 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="attention.output.dense",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="attention.output.dropout",
@@ -96,11 +108,15 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
                             "skip_bias_add": self.enable_bias_gelu_fused,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
                     SubModuleReplacementDescription(
                         suffix="output.dense",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="output.dropout",
@@ -215,7 +231,9 @@ def module_policy(self):
                 ViTForImageClassification: ModulePolicyDescription(
                     sub_module_replacement=[
                         SubModuleReplacementDescription(
-                            suffix="classifier", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                            suffix="classifier",
+                            target_module=Linear1D_Col,
+                            kwargs=dict(gather_output=True, fp8_communication=self.shard_config.fp8_communication),
                         )
                     ]
                 )
diff --git a/colossalai/shardformer/policies/whisper.py b/colossalai/shardformer/policies/whisper.py
index 441e512bbb28..7a1f146d5bb8 100644
--- a/colossalai/shardformer/policies/whisper.py
+++ b/colossalai/shardformer/policies/whisper.py
@@ -91,26 +91,44 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.out_proj",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="fc1",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="fc2",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                 ],
             )
@@ -128,42 +146,72 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.out_proj",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="encoder_attn.q_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="encoder_attn.k_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="encoder_attn.v_proj",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="encoder_attn.out_proj",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="fc1",
                         target_module=col_nn.Linear1D_Col,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="fc2",
                         target_module=col_nn.Linear1D_Row,
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        },
                     ),
                 ],
             )
@@ -174,7 +222,14 @@ def module_policy(self):
                     SubModuleReplacementDescription(
                         suffix="embed_tokens",
                         target_module=embedding_cls,
-                        kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                        kwargs=(
+                            {
+                                "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                                "fp8_communication": self.shard_config.fp8_communication,
+                            }
+                            if self.shard_config.enable_tensor_parallelism
+                            else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                        ),
                     ),
                 ],
                 policy=policy,
@@ -303,6 +358,7 @@ def add_lm_head_policy(self, base_policy):
                     kwargs={
                         "gather_output": True,
                         "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                        "fp8_communication": self.shard_config.fp8_communication,
                     },
                 ),
                 policy=base_policy,

From 0978080a69249befd67994391276b1dd84d0965d Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Tue, 13 Aug 2024 16:07:26 +0800
Subject: [PATCH 027/167] [fp8] refactor fp8 linear with compile (#5993)

* [fp8] refactor fp8 linear with compile

* [fp8] fix linear test

* [fp8] fix linear test
---
 colossalai/quantization/fp8.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index f2bffa09f15d..fe87e317de48 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -7,6 +7,8 @@
 from packaging.version import Version
 from torch.distributed import ReduceOp
 
+SUPPORT_TORCH_COMPILE = Version(torch.__version__) >= Version("2.3.0")
+
 
 def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False) -> Tuple[torch.Tensor, torch.Tensor]:
     r"""
@@ -664,13 +666,14 @@ def backward(ctx: Any, out_grad) -> Any:
         return x_grad.reshape(ctx.x_shape), w_grad, bias_grad
 
 
-if Version(torch.__version__) >= Version("2.3.0"):  # TODO failed on torch < 2.3.0
-
-    @torch.compile(mode="reduce-overhead", fullgraph=True)
-    def linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return _LinearFp8.apply(input, weight, bias)
+@torch.compile(mode="reduce-overhead", disable=not SUPPORT_TORCH_COMPILE)
+def _linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    return _LinearFp8.apply(input, weight, bias)
 
-else:
 
-    def linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        return _LinearFp8.apply(input, weight, bias)
+def linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    out = _linear_fp8(input, weight, bias)
+    if SUPPORT_TORCH_COMPILE:
+        # avoid modifying the tensor created from cuda graph
+        out = out.clone()
+    return out

From 597b2060013045cf0d0f0f8fddfc1b77ef716818 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 14 Aug 2024 14:08:19 +0800
Subject: [PATCH 028/167] [fp8] support asynchronous FP8 communication (#5997)

* fix

* fix

* fix

* support async all2all

* support async op for all gather

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 colossalai/quantization/fp8.py            | 158 +++++++++++++++-------
 tests/test_fp8/test_all_to_all_single.py  |  26 ++--
 tests/test_fp8/test_fp8_allgather_flat.py |   7 +-
 tests/test_fp8/test_fp8_allreduce.py      |  17 ++-
 tests/test_fp8/test_fp8_gather.py         |  10 +-
 5 files changed, 151 insertions(+), 67 deletions(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index fe87e317de48..4dd7db236c5d 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -10,6 +10,18 @@
 SUPPORT_TORCH_COMPILE = Version(torch.__version__) >= Version("2.3.0")
 
 
+class Handle:
+    def __init__(self, handles=[], remain_ops=None) -> None:
+        self.handles = handles
+        self.remain_ops = remain_ops
+
+    def wait(self):
+        for handle in self.handles:
+            handle.wait()
+        if self.remain_ops:
+            self.remain_ops()
+
+
 def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False) -> Tuple[torch.Tensor, torch.Tensor]:
     r"""
     casting torch Tensor into specified fp8 tensor with per-channel scaling or per-tensor scaling.
@@ -68,7 +80,9 @@ def cast_from_fp8(
     return ret.to(ret_type)
 
 
-def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3", op=ReduceOp.SUM, group=None) -> None:
+def all_reduce_fp8(
+    tensor: torch.Tensor, fp8_format="e4m3", op=ReduceOp.SUM, group=None, async_op: bool = False
+) -> Optional[Handle]:
     r"""
     This is an in-place operation for compressed all_reduce using fp8.
     It works like dist.all_reduce but during communication the data is cast to fp8 format.
@@ -105,6 +119,7 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3", op=ReduceOp.SUM, gro
     scale_list = [torch.ones(1, dtype=scale.dtype, device=input_device) for _ in range(world_size)]
     dist.all_gather(scale_list, scale, group=group)
     summed_out = torch.zeros_like(output_chunks[0]).to(input_type)
+
     for scale, out in zip(scale_list, output_chunks):
         out = out.view(fp8_type)
         summed_out += cast_from_fp8(out, scale, input_type)
@@ -113,19 +128,28 @@ def all_reduce_fp8(tensor: torch.Tensor, fp8_format="e4m3", op=ReduceOp.SUM, gro
         summed_out.div_(world_size)
 
     summed_out_fp8, scale = cast_to_fp8(summed_out, fp8_format=fp8_format)
-    dist.all_gather(scale_list, scale, group=group)
+    gather_scale_handle = dist.all_gather(scale_list, scale, group=group, async_op=async_op)
 
     tensor_list = [torch.empty_like(summed_out_fp8.view(torch.uint8)) for _ in range(world_size)]
-    dist.all_gather(tensor_list, summed_out_fp8.view(torch.uint8), group=group)
-    for i in range(world_size):
-        tensor_list[i] = tensor_list[i].view(fp8_type).to(input_type) * scale_list[i].to(input_device)
-    out = torch.cat(tensor_list, dim=0)
-    tensor.copy_(out[:input_size].view(input_shape).to(input_type))
+    gather_tensor_handle = dist.all_gather(
+        tensor_list, summed_out_fp8.view(torch.uint8), group=group, async_op=async_op
+    )
+
+    def cat_op():
+        for i in range(world_size):
+            tensor_list[i] = tensor_list[i].view(fp8_type).to(input_type) * scale_list[i]
+        out = torch.cat(tensor_list, dim=0)
+        tensor.copy_(out[:input_size].view(input_shape).to(input_type))
+
+    if async_op:
+        return Handle([gather_scale_handle, gather_tensor_handle], cat_op)
+    else:
+        cat_op()
 
 
 def all_to_all_single_fp8(
     output, input, output_split_sizes=None, input_split_sizes=None, fp8_format="e5m2", group=None, async_op=False
-) -> None:
+) -> Optional[Handle]:
     r"""
     This is an in-place operation for compressed all_reduce using fp8.
     It works like dist.all_to_all_single but during communication the data is cast to fp8 format.
@@ -163,20 +187,27 @@ def all_to_all_single_fp8(
         else:
             output_chunks = [torch.empty_like(input_chunks[0]) for _ in range(world_size)]
 
-    dist.all_to_all(output_chunks, input_chunks, group=group)
+    chunk_handle = dist.all_to_all(output_chunks, input_chunks, group=group, async_op=async_op)
     scale_list = [torch.ones(1, dtype=scale.dtype, device=input_device) for _ in range(world_size)]
-    dist.all_gather(scale_list, scale, group=group)
-    cast_output_chunk = [
-        cast_from_fp8(out.view(fp8_type), scale, input_type) for scale, out in zip(scale_list, output_chunks)
-    ]
+    scale_hanle = dist.all_gather(scale_list, scale, group=group, async_op=async_op)
 
-    tensor_out = torch.cat(cast_output_chunk, dim=0)
-    outputs_shape = list(input_shape)
-    if output_split_sizes is not None:
-        outputs_shape[0] = sum(output_split_sizes)
+    def cast_op():
+        cast_output_chunk = [
+            cast_from_fp8(out.view(fp8_type), scale, input_type) for scale, out in zip(scale_list, output_chunks)
+        ]
+
+        tensor_out = torch.cat(cast_output_chunk, dim=0)
+        outputs_shape = list(input_shape)
+        if output_split_sizes is not None:
+            outputs_shape[0] = sum(output_split_sizes)
+        else:
+            outputs_shape = input_shape
+        output.data = tensor_out.view(outputs_shape).to(input_type)
+
+    if async_op:
+        return Handle([chunk_handle, scale_hanle], cast_op)
     else:
-        outputs_shape = input_shape
-    output.data = tensor_out.view(outputs_shape).to(input_type)
+        cast_op()
 
 
 def cast_to_fp8_pipeline(inp: Any) -> None:
@@ -250,7 +281,9 @@ def cast_from_fp8_pipeline(inp: Any, del_metadata=True) -> None:
         del inp["dtype"]
 
 
-def reduce_scatter_fp8(output: torch.Tensor, input_list, group, fp8_format="e5m2") -> None:
+def reduce_scatter_fp8(
+    output: torch.Tensor, input_list, group, fp8_format="e5m2", async_op: bool = False
+) -> Optional[Handle]:
     r"""
     This is an in-place operation for compressed reduce_scatter using fp8.
     It works like dist.reduce_scatter but during communication the data is cast to fp8 format.
@@ -277,14 +310,20 @@ def reduce_scatter_fp8(output: torch.Tensor, input_list, group, fp8_format="e5m2
         cast_input_list.append(ret)
         output_chunks.append(torch.empty_like(ret))
         output_scale_list.append(torch.empty_like(scale))
-    dist.all_to_all(output_chunks, cast_input_list, group=group)
-    dist.all_to_all(output_scale_list, scale_list, group=group)
+    chunk_handle = dist.all_to_all(output_chunks, cast_input_list, group=group, async_op=async_op)
+    scale_handle = dist.all_to_all(output_scale_list, scale_list, group=group, async_op=async_op)
 
-    summed_out = torch.zeros_like(output_chunks[0]).to(input_type)
-    for scale, out in zip(output_scale_list, output_chunks):
-        out = out.view(fp8_type)
-        summed_out += cast_from_fp8(out, scale, input_type)
-    output.data = summed_out
+    def cast_op():
+        summed_out = torch.zeros_like(output_chunks[0]).to(input_type)
+        for scale, out in zip(output_scale_list, output_chunks):
+            out = out.view(fp8_type)
+            summed_out += cast_from_fp8(out, scale, input_type)
+        output.data = summed_out
+
+    if async_op:
+        return Handle([chunk_handle, scale_handle], cast_op)
+    else:
+        cast_op()
 
 
 def fp8_compress_ddp_grad_comm_hook_async(
@@ -500,7 +539,8 @@ def all_gather_into_tensor_flat_fp8(
     output_shape: torch.Size,
     group: dist.ProcessGroup,
     fp8_format: str = "e4m3",
-):
+    async_op: bool = False,
+) -> Optional[Handle]:
     """all gather into tensor in fp8 format
 
     Args:
@@ -547,15 +587,25 @@ def all_gather_into_tensor_flat_fp8(
         scale = fp8_max / per_tensor_max
         fp8_input = (scale * input_tensor.float()).to(fp8_type)
     scale_inv = 1.0 / scale
+
     buffer = torch.empty_like(output_tensor, dtype=fp8_type)
-    dist.all_gather_into_tensor(buffer.view(torch.uint8), fp8_input.view(torch.uint8), group=group)
-    numel = output_shape.numel()
-    valid_buffer = buffer[:numel].reshape(output_shape)
-    valid_buffer = cast_from_fp8(valid_buffer, scale_inv, input_type, per_channel_scale=(len(output_shape) == 2))
-    output_tensor[:numel].copy_(valid_buffer.view(-1))
+    tensor_handle = dist.all_gather_into_tensor(
+        buffer.view(torch.uint8), fp8_input.view(torch.uint8), group=group, async_op=async_op
+    )
+
+    def cast_op():
+        numel = output_shape.numel()
+        valid_buffer = buffer[:numel].reshape(output_shape)
+        valid_buffer = cast_from_fp8(valid_buffer, scale_inv, input_type, per_channel_scale=(len(output_shape) == 2))
+        output_tensor[:numel].copy_(valid_buffer.view(-1))
+
+    if async_op:
+        return Handle([tensor_handle], cast_op)
+    else:
+        cast_op()
 
 
-def all_to_all_fp8(output_list, input_list, group=None, fp8_format="e5m2"):
+def all_to_all_fp8(output_list, input_list, group=None, fp8_format="e5m2", async_op=False):
 
     world_size = dist.get_world_size(group)
 
@@ -573,17 +623,23 @@ def all_to_all_fp8(output_list, input_list, group=None, fp8_format="e5m2"):
 
     output_scale_list = [torch.empty_like(x) for x in scale_list]
     output_tensor_list = [torch.empty_like(x) for x in tensor_list]
-    dist.all_to_all(output_tensor_list, tensor_list, group=group)
-    dist.all_to_all(output_scale_list, scale_list, group=group)
+    tensor_hanle = dist.all_to_all(output_tensor_list, tensor_list, group=group, async_op=async_op)
+    scale_handle = dist.all_to_all(output_scale_list, scale_list, group=group, async_op=async_op)
 
-    for i in range(world_size):
-        scale = output_scale_list[i]
-        tensor = output_tensor_list[i]
-        tensor = tensor.view(fp8_type)
-        output_list[i].copy_(cast_from_fp8(tensor, scale, input_type))
+    def cast_op():
+        for i in range(world_size):
+            scale = output_scale_list[i]
+            tensor = output_tensor_list[i]
+            tensor = tensor.view(fp8_type)
+            output_list[i].copy_(cast_from_fp8(tensor, scale, input_type))
+
+    if async_op:
+        return Handle([tensor_hanle, scale_handle], cast_op)
+    else:
+        cast_op()
 
 
-def gather_fp8(output_list, input_, group=None, fp8_format="e5m2"):
+def gather_fp8(output_list, input_, group=None, fp8_format="e5m2", async_op: bool = False) -> Optional[Handle]:
 
     world_size = dist.get_world_size(group)
 
@@ -593,13 +649,19 @@ def gather_fp8(output_list, input_, group=None, fp8_format="e5m2"):
     input_ = ret.view(torch.uint8)
     tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
     scale_list = [torch.ones(1, dtype=scale.dtype, device=input_.device) for _ in range(world_size)]
-    dist.all_gather(tensor_list, input_, group=group)
-    dist.all_gather(scale_list, scale, group=group)
+    chunk_handle = dist.all_gather(tensor_list, input_, group=group, async_op=async_op)
+    scale_hanle = dist.all_gather(scale_list, scale, group=group, async_op=async_op)
 
-    for i in range(world_size):
-        output = tensor_list[i].view(fp8_type)
-        scale = scale_list[i]
-        output_list[i].copy_(cast_from_fp8(output, scale, input_type))
+    def cast_op():
+        for i in range(world_size):
+            output = tensor_list[i].view(fp8_type)
+            scale = scale_list[i]
+            output_list[i].copy_(cast_from_fp8(output, scale, input_type))
+
+    if async_op:
+        return Handle([chunk_handle, scale_hanle], cast_op)
+    else:
+        cast_op()
 
 
 class _LinearFp8(torch.autograd.Function):
diff --git a/tests/test_fp8/test_all_to_all_single.py b/tests/test_fp8/test_all_to_all_single.py
index 88becd3f07fc..722cbce9ac02 100644
--- a/tests/test_fp8/test_all_to_all_single.py
+++ b/tests/test_fp8/test_all_to_all_single.py
@@ -10,19 +10,24 @@
 
 
 @parameterize("shape", [(4,), (1, 8, 16), (4, 8, 16)])
-@parameterize("dtype", [torch.bfloat16])
-def check_all2all(shape, dtype):
+@parameterize("dtype", [torch.bfloat16, torch.float16])
+@parameterize("async_op", [True, False])
+def check_all2all(shape, dtype, async_op):
     x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
     output = torch.empty_like(x)
     output_fp8 = torch.empty_like(x)
-    dist.all_to_all_single(output, x, group=_get_default_group(), async_op=False)
-    all_to_all_single_fp8(output_fp8, x, group=_get_default_group(), async_op=False)
+    origin_hanle = dist.all_to_all_single(output, x, group=_get_default_group(), async_op=async_op)
+    fp8_handle = all_to_all_single_fp8(output_fp8, x, group=_get_default_group(), async_op=async_op)
+    if async_op:
+        origin_hanle.wait()
+        fp8_handle.wait()
     assert_close(output, output_fp8, rtol=0.1, atol=0.1)
 
 
 @parameterize("shape", [(8, 8, 16)])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
-def check_all2all_uneven(shape, dtype):
+@parameterize("async_op", [True, False])
+def check_all2all_uneven(shape, dtype, async_op):
     x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
     input_split_sizes = [3, 3, 1, 1]
     if dist.get_rank() in [0, 1]:
@@ -33,22 +38,25 @@ def check_all2all_uneven(shape, dtype):
     output_shape[0] = sum(output_split_sizes)
     output = torch.empty(output_shape, device=x.device, dtype=x.dtype)
     output_fp8 = torch.empty(output_shape, device=x.device, dtype=x.dtype)
-    dist.all_to_all_single(
+    origin_hanle = dist.all_to_all_single(
         output,
         x,
         output_split_sizes=output_split_sizes,
         input_split_sizes=input_split_sizes,
         group=_get_default_group(),
-        async_op=False,
+        async_op=async_op,
     )
-    all_to_all_single_fp8(
+    fp8_handle = all_to_all_single_fp8(
         output_fp8,
         x,
         output_split_sizes=output_split_sizes,
         input_split_sizes=input_split_sizes,
         group=_get_default_group(),
-        async_op=False,
+        async_op=async_op,
     )
+    if async_op:
+        origin_hanle.wait()
+        fp8_handle.wait()
     assert_close(output, output_fp8, rtol=0.1, atol=0.1)
 
 
diff --git a/tests/test_fp8/test_fp8_allgather_flat.py b/tests/test_fp8/test_fp8_allgather_flat.py
index 35e8796c2882..2d43e5bd5902 100644
--- a/tests/test_fp8/test_fp8_allgather_flat.py
+++ b/tests/test_fp8/test_fp8_allgather_flat.py
@@ -12,7 +12,8 @@
 
 @parameterize("shape", [(3, 7), (2, 1), (1, 2), (2, 2), (4, 2), (5,), (4,), (2,)])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
-def check_4gpu(shape, dtype):
+@parameterize("async_op", [True, False])
+def check_4gpu(shape, dtype, async_op):
     world_size = dist.get_world_size()
     rank = dist.get_rank()
     x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
@@ -22,7 +23,9 @@ def check_4gpu(shape, dtype):
         flat_padded_x = F.pad(flat_padded_x, (0, pad_size))
     output = torch.empty_like(flat_padded_x)
     chunk = flat_padded_x.chunk(world_size)[rank].clone()
-    all_gather_into_tensor_flat_fp8(output, chunk, x.shape, group=_get_default_group())
+    handle = all_gather_into_tensor_flat_fp8(output, chunk, x.shape, group=_get_default_group(), async_op=async_op)
+    if async_op:
+        handle.wait()
     assert_close(output[: x.numel()], x.view(-1), rtol=0.1, atol=0.1)
 
 
diff --git a/tests/test_fp8/test_fp8_allreduce.py b/tests/test_fp8/test_fp8_allreduce.py
index c23959b5d0da..ccc43ed2979f 100644
--- a/tests/test_fp8/test_fp8_allreduce.py
+++ b/tests/test_fp8/test_fp8_allreduce.py
@@ -22,15 +22,22 @@
 )
 @parameterize("dtype", [torch.float16, torch.bfloat16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
-def check_4gpu(shape, dtype, fp8_format):
+@parameterize("async_op", [True, False])
+def check_4gpu(shape, dtype, fp8_format, async_op):
     x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
     x_fp8 = x.clone()
-    dist.all_reduce(x)
-    all_reduce_fp8(x_fp8, fp8_format=fp8_format)
+    origin_handle = dist.all_reduce(x, async_op=async_op)
+    fp8_handle = all_reduce_fp8(x_fp8, fp8_format=fp8_format, async_op=async_op)
+    if async_op:
+        origin_handle.wait()
+        fp8_handle.wait()
     assert_close(x, x_fp8, rtol=0.1, atol=0.1)
 
-    dist.all_reduce(x, op=dist.ReduceOp.AVG)
-    all_reduce_fp8(x_fp8, op=dist.ReduceOp.AVG, fp8_format=fp8_format)
+    origin_handle = dist.all_reduce(x, op=dist.ReduceOp.AVG, async_op=async_op)
+    fp8_handle = all_reduce_fp8(x_fp8, op=dist.ReduceOp.AVG, fp8_format=fp8_format, async_op=async_op)
+    if async_op:
+        origin_handle.wait()
+        fp8_handle.wait()
     assert_close(x, x_fp8, rtol=0.1, atol=0.1)
 
 
diff --git a/tests/test_fp8/test_fp8_gather.py b/tests/test_fp8/test_fp8_gather.py
index 79d1d4ea49e6..40c2ccb9a17b 100644
--- a/tests/test_fp8/test_fp8_gather.py
+++ b/tests/test_fp8/test_fp8_gather.py
@@ -24,13 +24,17 @@
 )
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
-def check_4gpu(shape, dtype, fp8_format):
+@parameterize("async_op", [True, False])
+def check_4gpu(shape, dtype, fp8_format, async_op):
     world_size = dist.get_world_size()
     x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
     output_list = [torch.empty_like(x) for _ in range(world_size)]
     output_list_fp8 = [torch.empty_like(x) for _ in range(world_size)]
-    gather_fp8(output_list_fp8, x, group=_get_default_group(), fp8_format=fp8_format)
-    dist.all_gather(output_list, x, group=_get_default_group())
+    fp8_handle = gather_fp8(output_list_fp8, x, group=_get_default_group(), fp8_format=fp8_format, async_op=async_op)
+    origin_hanle = dist.all_gather(output_list, x, group=_get_default_group(), async_op=async_op)
+    if async_op:
+        fp8_handle.wait()
+        origin_hanle.wait()
     assert_close(output_list, output_list_fp8, rtol=0.1, atol=0.1)
 
 

From 88fa096d789578d63ac988f6e6f4788c94e59598 Mon Sep 17 00:00:00 2001
From: botbw <wang1570@e.ntu.edu.sg>
Date: Thu, 15 Aug 2024 10:14:42 +0800
Subject: [PATCH 029/167] [fp8] update torch.compile for linear_fp8 to >= 2.4.0
 (#6004)

---
 colossalai/quantization/fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 4dd7db236c5d..8ada42935e06 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -7,7 +7,7 @@
 from packaging.version import Version
 from torch.distributed import ReduceOp
 
-SUPPORT_TORCH_COMPILE = Version(torch.__version__) >= Version("2.3.0")
+SUPPORT_TORCH_COMPILE = Version(torch.__version__) >= Version("2.4.0")
 
 
 class Handle:

From 1a2e90dcc13e64f3bf273d36315b26095c0dd64c Mon Sep 17 00:00:00 2001
From: botbw <wang1570@e.ntu.edu.sg>
Date: Thu, 15 Aug 2024 03:12:08 +0000
Subject: [PATCH 030/167] [fp8] linear perf enhancement

---
 colossalai/quantization/fp8.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 8ada42935e06..5b606616e97a 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -728,14 +728,11 @@ def backward(ctx: Any, out_grad) -> Any:
         return x_grad.reshape(ctx.x_shape), w_grad, bias_grad
 
 
-@torch.compile(mode="reduce-overhead", disable=not SUPPORT_TORCH_COMPILE)
+@torch.compile(mode="max-autotune-no-cudagraphs", disable=not SUPPORT_TORCH_COMPILE, dynamic=False)
 def _linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
     return _LinearFp8.apply(input, weight, bias)
 
 
 def linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
     out = _linear_fp8(input, weight, bias)
-    if SUPPORT_TORCH_COMPILE:
-        # avoid modifying the tensor created from cuda graph
-        out = out.clone()
     return out

From 20722a8c93e868f5ce998e610aa8e96a410575f5 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Thu, 15 Aug 2024 14:40:54 +0800
Subject: [PATCH 031/167] [fp8]update reduce-scatter test (#6002)

* fix

* fix

* fix

* fix
---
 tests/test_fp8/test_fp8_reduce_scatter.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/test_fp8/test_fp8_reduce_scatter.py b/tests/test_fp8/test_fp8_reduce_scatter.py
index c18446e39ea0..e0b558a257ed 100644
--- a/tests/test_fp8/test_fp8_reduce_scatter.py
+++ b/tests/test_fp8/test_fp8_reduce_scatter.py
@@ -13,14 +13,20 @@
 @parameterize("scatter_dim", [0, 1, 2])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
-def check_4gpu(shape, scatter_dim, dtype, fp8_format):
+@parameterize("async_op", [True, False])
+def check_4gpu(shape, scatter_dim, dtype, fp8_format, async_op):
     x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
     input_list = list(torch.chunk(x, dim=scatter_dim, chunks=4))
     input_list = [t.contiguous() for t in input_list]
     output_origin = torch.empty_like(input_list[0])
     output_fp8 = torch.empty_like(input_list[0])
-    reduce_scatter(output_origin, input_list, group=_get_default_group())
-    reduce_scatter_fp8(output_fp8, input_list, group=_get_default_group(), fp8_format=fp8_format)
+    origin_handle = reduce_scatter(output_origin, input_list, group=_get_default_group(), async_op=async_op)
+    fp8_handle = reduce_scatter_fp8(
+        output_fp8, input_list, group=_get_default_group(), fp8_format=fp8_format, async_op=async_op
+    )
+    if async_op:
+        origin_handle.wait()
+        fp8_handle.wait()
     assert_close(output_origin, output_fp8, rtol=0.1, atol=0.1)
 
 

From 3f09a6145f7c68350a508296c36fa77814e6010d Mon Sep 17 00:00:00 2001
From: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Date: Fri, 16 Aug 2024 10:12:50 +0800
Subject: [PATCH 032/167]  [fp8] add use_fp8 option for MoeHybridParallelPlugin
 (#6009)

---
 colossalai/booster/plugin/moe_hybrid_parallel_plugin.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index ca3a68373184..374fc653556e 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -215,6 +215,7 @@ def __init__(
         overlap_p2p: bool = True,
         overlap_allgather: bool = False,
         fp8_communication: bool = False,
+        use_fp8: bool = False,
     ) -> None:
         if overlap_communication or zero_stage == 2:
             overlap_communication = False
@@ -324,7 +325,7 @@ def __init__(
             self.sp_group = self.pg_mesh.get_group_along_axis(self.tp_axis)
         else:
             self.sp_group = self.pg_mesh.get_group_along_axis(self.sp_axis)
-
+        self.use_fp8 = use_fp8
         self.shard_config = ShardConfig(
             tensor_parallel_process_group=self.tp_group,
             sequence_parallel_process_group=self.sp_group,
@@ -428,6 +429,7 @@ def configure(
                 use_ddp=use_ddp,
                 ddp_config=self.ddp_config,
                 custom_policy=self.custom_policy,
+                use_fp8=self.use_fp8,
             )
         if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
             if self.ep_size > 1:

From 0a51319113d57f69555454facfe4e01b09915648 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 16 Aug 2024 10:13:07 +0800
Subject: [PATCH 033/167] [fp8] zero support fp8 linear. (#6006)

* fix

* fix

* fix

* zero fp8

* zero fp8

* Update requirements.txt
---
 .../booster/plugin/low_level_zero_plugin.py   | 19 ++++++++++++++++---
 examples/language/llama/benchmark.py          |  1 -
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 64f264f7eba1..63d46f6f8a2e 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -35,6 +35,7 @@
 from colossalai.interface.optimizer import DistributedOptim
 from colossalai.nn.optimizer import DistGaloreAwamW, cast_to_distributed
 from colossalai.quantization import BnbQuantizationConfig, quantize_model
+from colossalai.quantization.fp8_hook import FP8Hook
 from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
 from colossalai.zero import LowLevelZeroOptimizer
@@ -62,7 +63,9 @@ class OptimizerParamCheckState(enum.Enum):
 
 
 class LowLevelZeroModel(ModelWrapper, AMPModelMixin):
-    def __init__(self, module: nn.Module, precision: str, overlap_allgather: bool = False) -> None:
+    def __init__(
+        self, module: nn.Module, precision: str, overlap_allgather: bool = False, use_fp8: bool = False
+    ) -> None:
         super().__init__(module)
         self.dtype = None
         if precision == "fp16":
@@ -74,11 +77,16 @@ def __init__(self, module: nn.Module, precision: str, overlap_allgather: bool =
         module = module.to(get_accelerator().get_current_device())
         self.module = module
         self.convert_fn = None
+        self.use_fp8 = use_fp8
         if self.dtype is not None:
             self.convert_fn = partial(_convert_floating_point, dtype=self.dtype)
         self.overlap_allgather = overlap_allgather
+        self.op_hooks = []
         if overlap_allgather:
-            self.op_hook = ZeroOpHook()
+            self.op_hooks.append(ZeroOpHook())
+        if use_fp8:
+            self.op_hooks.append(FP8Hook())
+        if overlap_allgather or use_fp8:
             for p in module.parameters():
                 if p.requires_grad and type(p) is not ColoParameter:
                     p.__class__ = ColoParameter
@@ -335,6 +343,7 @@ def __init__(
         master_weights: bool = True,
         verbose: bool = False,
         fp8_communication: bool = False,
+        use_fp8: bool = False,
     ) -> None:
         super().__init__()
         assert stage in (1, 2), f"LowLevelZeroPlugin only supports stage 1/2 training"
@@ -362,6 +371,7 @@ def __init__(
         )
         self.lora_enabled = False
         self.verbose = verbose
+        self.use_fp8 = use_fp8
 
         # set class name with stage, for better error message
         setattr(self.__class__, "__name__", f"LowLevelZeroPlugin_ZeRO-{stage}")
@@ -476,7 +486,10 @@ def configure(
 
         if not isinstance(model, ModelWrapper):
             model = LowLevelZeroModel(
-                model, self.precision, overlap_allgather=self.zero_optim_kwargs["overlap_allgather"]
+                model,
+                self.precision,
+                overlap_allgather=self.zero_optim_kwargs["overlap_allgather"],
+                use_fp8=self.use_fp8,
             )
 
         # TODO: Support Galore + ZeRO
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 07583161b6fb..21d081145cd9 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -259,7 +259,6 @@ def empty_init():
         if isinstance(plugin, (GeminiPlugin, HybridParallelPlugin))
         else nullcontext()
     )
-
     init_kwargs = {}
     if config.model_type == "chatglm":
         init_kwargs["empty_init"] = False

From 81272e9d005ab2aeebe5ddd24d671d3dd93e2b61 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 17 Aug 2024 09:37:37 +0000
Subject: [PATCH 034/167] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../coati/dataset/tokenization_utils.py       |  2 +-
 .../ColossalChat/coati/trainer/dpo.py         |  2 +-
 .../ColossalChat/coati/trainer/kto.py         |  2 +-
 .../ColossalChat/coati/trainer/orpo.py        |  2 +-
 applications/ColossalChat/examples/README.md  |  2 +-
 .../examples/training_scripts/train_kto.py    |  2 +-
 .../examples/training_scripts/train_orpo.py   |  2 +-
 applications/ColossalChat/requirements.txt    |  2 +-
 applications/ColossalChat/tests/test_train.sh |  2 +-
 .../booster/plugin/low_level_zero_plugin.py   |  7 ++++++-
 colossalai/shardformer/layer/loss.py          |  1 +
 colossalai/shardformer/modeling/chatglm2.py   |  5 -----
 colossalai/shardformer/modeling/command.py    |  2 +-
 colossalai/shardformer/modeling/deepseek.py   |  6 ++++--
 colossalai/shardformer/modeling/llama.py      | 20 +++++++++++++------
 colossalai/shardformer/modeling/mixtral.py    |  4 +++-
 colossalai/shardformer/policies/mixtral.py    |  2 +-
 17 files changed, 39 insertions(+), 26 deletions(-)

diff --git a/applications/ColossalChat/coati/dataset/tokenization_utils.py b/applications/ColossalChat/coati/dataset/tokenization_utils.py
index d2bc38aa086e..020432b9ec3c 100755
--- a/applications/ColossalChat/coati/dataset/tokenization_utils.py
+++ b/applications/ColossalChat/coati/dataset/tokenization_utils.py
@@ -392,4 +392,4 @@ def tokenize_kto(
         "label": data_point["label"],
         "input_id_decode": decoded_full_prompt,
         "completion_decode": decoded_completion,
-    }
\ No newline at end of file
+    }
diff --git a/applications/ColossalChat/coati/trainer/dpo.py b/applications/ColossalChat/coati/trainer/dpo.py
index c6a174317c35..24ddca6545c8 100755
--- a/applications/ColossalChat/coati/trainer/dpo.py
+++ b/applications/ColossalChat/coati/trainer/dpo.py
@@ -356,4 +356,4 @@ def _eval(self, epoch: int):
         os.makedirs(self.save_dir, exist_ok=True)
         with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
             f.write(msg)
-        step_bar.close()
\ No newline at end of file
+        step_bar.close()
diff --git a/applications/ColossalChat/coati/trainer/kto.py b/applications/ColossalChat/coati/trainer/kto.py
index 11fda0aa8290..6462ba816686 100755
--- a/applications/ColossalChat/coati/trainer/kto.py
+++ b/applications/ColossalChat/coati/trainer/kto.py
@@ -346,4 +346,4 @@ def _eval(self, epoch: int):
         os.makedirs(self.save_dir, exist_ok=True)
         with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
             f.write(msg)
-        step_bar.close()
\ No newline at end of file
+        step_bar.close()
diff --git a/applications/ColossalChat/coati/trainer/orpo.py b/applications/ColossalChat/coati/trainer/orpo.py
index 948232bff096..c2f75771cdff 100644
--- a/applications/ColossalChat/coati/trainer/orpo.py
+++ b/applications/ColossalChat/coati/trainer/orpo.py
@@ -323,4 +323,4 @@ def _eval(self, epoch: int):
         os.makedirs(self.save_dir, exist_ok=True)
         with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
             f.write(msg)
-        step_bar.close()
\ No newline at end of file
+        step_bar.close()
diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md
index 7fd6011deaf4..fec7bc061270 100755
--- a/applications/ColossalChat/examples/README.md
+++ b/applications/ColossalChat/examples/README.md
@@ -903,4 +903,4 @@ For details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tree/mai
 ## Attention
 
 
-The examples are demos for the whole training process. You need to change the hyper-parameters to reach great performance.
\ No newline at end of file
+The examples are demos for the whole training process. You need to change the hyper-parameters to reach great performance.
diff --git a/applications/ColossalChat/examples/training_scripts/train_kto.py b/applications/ColossalChat/examples/training_scripts/train_kto.py
index db48e33e329d..598fd8062fcf 100755
--- a/applications/ColossalChat/examples/training_scripts/train_kto.py
+++ b/applications/ColossalChat/examples/training_scripts/train_kto.py
@@ -375,4 +375,4 @@ def train(args):
         os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
         with open(args.config_file, "w") as f:
             json.dump(args.__dict__, f, indent=4)
-    train(args)
\ No newline at end of file
+    train(args)
diff --git a/applications/ColossalChat/examples/training_scripts/train_orpo.py b/applications/ColossalChat/examples/training_scripts/train_orpo.py
index fd99af828528..87860f7ea023 100755
--- a/applications/ColossalChat/examples/training_scripts/train_orpo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_orpo.py
@@ -340,4 +340,4 @@ def train(args):
         os.makedirs(os.path.dirname(args.config_file), exist_ok=True)
         with open(args.config_file, "w") as f:
             json.dump(args.__dict__, f, indent=4)
-    train(args)
\ No newline at end of file
+    train(args)
diff --git a/applications/ColossalChat/requirements.txt b/applications/ColossalChat/requirements.txt
index 1978120d1f2e..ac40ae821d0a 100755
--- a/applications/ColossalChat/requirements.txt
+++ b/applications/ColossalChat/requirements.txt
@@ -20,4 +20,4 @@ datasets
 ninja==1.11.1
 sentencepiece==0.1.99
 flash-attn
-tiktoken
\ No newline at end of file
+tiktoken
diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index 3c47836d2309..69036de635c9 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -640,4 +640,4 @@ for lora_rank in ${LORA_RANK[@]}; do
             fi
         done
     done
-done
\ No newline at end of file
+done
diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 533a2004df32..448fb9e2193d 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -64,7 +64,12 @@ class OptimizerParamCheckState(enum.Enum):
 
 class LowLevelZeroModel(ModelWrapper, AMPModelMixin):
     def __init__(
-        self, module: nn.Module, precision: str, overlap_allgather: bool = False, cast_inputs: bool = True, use_fp8: bool = False
+        self,
+        module: nn.Module,
+        precision: str,
+        overlap_allgather: bool = False,
+        cast_inputs: bool = True,
+        use_fp8: bool = False,
     ) -> None:
         super().__init__(module)
         self.dtype = None
diff --git a/colossalai/shardformer/layer/loss.py b/colossalai/shardformer/layer/loss.py
index 41503a6bff70..12df824d1c0c 100644
--- a/colossalai/shardformer/layer/loss.py
+++ b/colossalai/shardformer/layer/loss.py
@@ -3,6 +3,7 @@
 from torch.autograd import Function
 from torch.distributed import ProcessGroup
 from torch.nn import CrossEntropyLoss
+
 from colossalai.shardformer.layer._operation import reduce_forward
 from colossalai.shardformer.shard import ShardConfig
 
diff --git a/colossalai/shardformer/modeling/chatglm2.py b/colossalai/shardformer/modeling/chatglm2.py
index 8d8608866337..a761968af009 100644
--- a/colossalai/shardformer/modeling/chatglm2.py
+++ b/colossalai/shardformer/modeling/chatglm2.py
@@ -16,11 +16,6 @@
     gather_forward_split_backward,
     split_forward_gather_backward,
 )
-from colossalai.shardformer.layer._operation import (
-    all_to_all_comm,
-    gather_forward_split_backward,
-    split_forward_gather_backward,
-)
 
 
 def get_flash_core_attention_forward():
diff --git a/colossalai/shardformer/modeling/command.py b/colossalai/shardformer/modeling/command.py
index b3adcc5d43f2..5303383945bc 100644
--- a/colossalai/shardformer/modeling/command.py
+++ b/colossalai/shardformer/modeling/command.py
@@ -24,7 +24,7 @@
 )
 from colossalai.shardformer.shard import ShardConfig
 
-from ..layer import ColoAttention, dist_cross_entropy, cross_entropy_1d
+from ..layer import ColoAttention, dist_cross_entropy
 
 _SUPPORTED_SP_MODE = ["all_to_all", "split_gather", "ring", "ring_attn"]
 
diff --git a/colossalai/shardformer/modeling/deepseek.py b/colossalai/shardformer/modeling/deepseek.py
index bec85bd589db..59f9d4516ee7 100644
--- a/colossalai/shardformer/modeling/deepseek.py
+++ b/colossalai/shardformer/modeling/deepseek.py
@@ -145,7 +145,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         output_split_sizes = torch.zeros_like(input_split_sizes)
 
         # [n0, n1, n2, n3] [m0, m1, m2, m3] -> [n0, n1, m0, m1] [n2, n3, m2, m3]
-        dist.all_to_all_single(output_split_sizes, input_split_sizes, group=self.ep_group, fp8_communication=fp8_communication)
+        dist.all_to_all_single(
+            output_split_sizes, input_split_sizes, group=self.ep_group, fp8_communication=fp8_communication
+        )
 
         with torch.no_grad():
             activate_experts = output_split_sizes[: self.num_experts_per_ep].clone()
@@ -694,7 +696,7 @@ def forward(
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
-        
+
         # TODO: upgrade transformers to 4.44.0 to fix the bug, remove the hard code.
         self._use_flash_attention_2 = shard_config.enable_flash_attention
         self._use_sdpa = False if shard_config.enable_flash_attention else self._use_sdpa
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 4fdca2d3c637..71d8daa35309 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -26,11 +26,15 @@
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer.layer import AttnMaskType
-from colossalai.shardformer.layer._operation import all_to_all_comm, gather_forward_split_backward, split_forward_gather_backward
+from colossalai.shardformer.layer._operation import (
+    all_to_all_comm,
+    gather_forward_split_backward,
+    split_forward_gather_backward,
+)
 from colossalai.shardformer.layer.utils import is_share_sp_tp, split_batch_zigzag
 from colossalai.shardformer.shard import ShardConfig
 
-from ..layer import ColoAttention, RingAttention, dist_cross_entropy, cross_entropy_1d
+from ..layer import ColoAttention, RingAttention, dist_cross_entropy
 
 _SUPPORTED_SP_MODE = ["all_to_all", "split_gather", "ring", "ring_attn"]
 
@@ -162,9 +166,13 @@ def llama_model_forward(
                     hidden_states, position_ids = split_batch_zigzag([hidden_states, position_ids], sp_group)
 
             elif is_share_sp_tp(sp_mode):
-                hidden_states = split_forward_gather_backward(hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication)
+                hidden_states = split_forward_gather_backward(
+                    hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
+                )
             elif sp_mode == "all_to_all":
-                hidden_states = split_forward_gather_backward(hidden_states, 1, sp_group, 1 / sp_size, fp8_communication=shard_config.fp8_communication)
+                hidden_states = split_forward_gather_backward(
+                    hidden_states, 1, sp_group, 1 / sp_size, fp8_communication=shard_config.fp8_communication
+                )
 
         if self.gradient_checkpointing and self.training and use_cache:
             if use_cache:
@@ -355,7 +363,7 @@ def llama_for_causal_lm_forward(
             loss = dist_cross_entropy(
                 labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
             )
-            
+
             if not return_dict:
                 output = (logits,) + outputs[1:]
                 return (loss,) + output if loss is not None else output
@@ -675,7 +683,7 @@ def forward(
 
         past_seen_tokens = 0
         seq_len = inputs_embeds.shape[1]
-        batch_size = inputs_embeds.shape[0]
+        inputs_embeds.shape[0]
         if use_cache:  # kept for BC (cache positions)
             if not isinstance(past_key_values, StaticCache):
                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index c488d3cc446b..50334677e3fc 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -691,7 +691,9 @@ def forward(
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
             attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()  # (1, 8, 128)
-            attn_output = all_to_all_comm(attn_output, sp_group, scatter_dim=1, gather_dim=2, fp8_communication=shard_config.fp8_communication)  # (1, 4, 256)
+            attn_output = all_to_all_comm(
+                attn_output, sp_group, scatter_dim=1, gather_dim=2, fp8_communication=shard_config.fp8_communication
+            )  # (1, 4, 256)
         else:
             attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index 1f19ff65d723..4bdca78cb0ac 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -5,7 +5,7 @@
 import torch.nn as nn
 from torch import Tensor
 from torch.nn import Module
-from transformers.models.mixtral.modeling_mixtral import MixtralDecoderLayer, MixtralForCausalLM, MixtralModel
+from transformers.models.mixtral.modeling_mixtral import MixtralForCausalLM, MixtralModel
 
 from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col
 from colossalai.shardformer.layer.embedding import PaddingEmbedding, VocabParallelEmbedding1D

From 02636c5bef5cc9ae9c6b2c0e38e4e53e28e47060 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 19 Aug 2024 02:26:52 +0000
Subject: [PATCH 035/167] fix the merge

---
 colossalai/shardformer/layer/_operation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index c4ef4863473a..bfe408065d96 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -767,12 +767,12 @@ class _ReduceForward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group):
-        return _reduce(input_, process_group)
+    def forward(ctx, input_, process_group, fp8_communication=False):
+        return _reduce(input_, process_group, fp8_communication, fp8_format="e4m3")
 
     @staticmethod
     def backward(ctx, grad_output):
-        return grad_output, None
+        return grad_output, None, None
 
 
 class _ReduceBackward(torch.autograd.Function):

From 1a5847e6d1063f0f201ea3f32d5533b294fed0fa Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 19 Aug 2024 03:28:29 +0000
Subject: [PATCH 036/167] fix the merge

---
 colossalai/shardformer/layer/_operation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index bfe408065d96..aed9d8351108 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -1082,8 +1082,8 @@ def split_forward_gather_backward(input_, dim, process_group, grad_scale=None, f
     return _SplitForwardGatherBackward.apply(input_, dim, process_group, grad_scale, fp8_communication)
 
 
-def reduce_forward(input_, process_group, grad_scale=None, fp8_communication=False):
-    return _ReduceForward.apply(input_, process_group, grad_scale, fp8_communication)
+def reduce_forward(input_, process_group, fp8_communication=False):
+    return _ReduceForward.apply(input_, process_group, fp8_communication)
 
 
 def reduce_backward(input_, process_group, fp8_communication=False):

From 33530425259d995eec8aa32b62b1a1be06511254 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 19 Aug 2024 08:07:51 +0000
Subject: [PATCH 037/167] fix the merge

---
 colossalai/zero/low_level/low_level_optim.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py
index 090436cbccb3..12ef9252be70 100644
--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -588,9 +588,10 @@ def step(self, closure=None):
                         self.pg_to_tensor_bucket[pg].all_gather(pg, fp8_communication=self._fp8_communication)
                         self.pg_to_tensor_bucket[pg].add_to_bucket(param_to_gather, write_back_tensor=working_param)
             self.optim.param_groups[group_id]["params"] = self._master_param_groups_of_current_rank[group_id]
-        for pg, tensor_bucket in self.pg_to_tensor_bucket.items():
-            if not tensor_bucket.is_empty():
-                tensor_bucket.all_gather(pg)
+        if not self._overlap_allgather:
+             for pg, tensor_bucket in self.pg_to_tensor_bucket.items():
+                 if not tensor_bucket.is_empty():
+                     tensor_bucket.all_gather(pg, fp8_communication=self._fp8_communication)
 
     def _compute_grad_norm(self, dp_pg: ProcessGroup, gradients: List[Tensor], norm_type: int = 2) -> float:
         r"""

From 64aad967239bc053b6a44d912a2fc94fdcbde4ff Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 19 Aug 2024 08:08:45 +0000
Subject: [PATCH 038/167] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/zero/low_level/low_level_optim.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py
index 12ef9252be70..458e6e41a29e 100644
--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -589,9 +589,9 @@ def step(self, closure=None):
                         self.pg_to_tensor_bucket[pg].add_to_bucket(param_to_gather, write_back_tensor=working_param)
             self.optim.param_groups[group_id]["params"] = self._master_param_groups_of_current_rank[group_id]
         if not self._overlap_allgather:
-             for pg, tensor_bucket in self.pg_to_tensor_bucket.items():
-                 if not tensor_bucket.is_empty():
-                     tensor_bucket.all_gather(pg, fp8_communication=self._fp8_communication)
+            for pg, tensor_bucket in self.pg_to_tensor_bucket.items():
+                if not tensor_bucket.is_empty():
+                    tensor_bucket.all_gather(pg, fp8_communication=self._fp8_communication)
 
     def _compute_grad_norm(self, dp_pg: ProcessGroup, gradients: List[Tensor], norm_type: int = 2) -> float:
         r"""

From 4c82bfcc540bc17f5fee03445035b47dd993e9a7 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 19 Aug 2024 08:09:34 +0000
Subject: [PATCH 039/167] fix the merge

---
 colossalai/zero/low_level/low_level_optim.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py
index 12ef9252be70..458e6e41a29e 100644
--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -589,9 +589,9 @@ def step(self, closure=None):
                         self.pg_to_tensor_bucket[pg].add_to_bucket(param_to_gather, write_back_tensor=working_param)
             self.optim.param_groups[group_id]["params"] = self._master_param_groups_of_current_rank[group_id]
         if not self._overlap_allgather:
-             for pg, tensor_bucket in self.pg_to_tensor_bucket.items():
-                 if not tensor_bucket.is_empty():
-                     tensor_bucket.all_gather(pg, fp8_communication=self._fp8_communication)
+            for pg, tensor_bucket in self.pg_to_tensor_bucket.items():
+                if not tensor_bucket.is_empty():
+                    tensor_bucket.all_gather(pg, fp8_communication=self._fp8_communication)
 
     def _compute_grad_norm(self, dp_pg: ProcessGroup, gradients: List[Tensor], norm_type: int = 2) -> float:
         r"""

From 12b44012d95878004c4026bab637ddb563cd6759 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 19 Aug 2024 09:02:16 +0000
Subject: [PATCH 040/167] fix

---
 .../booster/plugin/hybrid_parallel_plugin.py  | 49 ++++++++++---------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 9e5d7b0d77f7..bd970878f1dd 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1278,30 +1278,31 @@ def configure(
                 overlap_allgather=(self.zero_stage > 0 and self.zero_config["overlap_allgather"]),
                 use_fp8=self.use_fp8,
             )
-            if zero_stage == 0:
-                is_zero = False
-                if self.precision in ["fp16", "bf16"]:
-                    optimizer = HybridParallelAMPOptimizer(
-                        optimizer,
-                        model,
-                        use_pipeline=self.enable_pipeline_parallelism,
-                        param_info=param_info,
-                        precision=self.precision,
-                        max_norm=self.max_norm,
-                        pp_process_group=self.pp_group,
-                        tp_process_group=self.tp_group,
-                        **self.amp_config,
-                    )
-                else:
-                    optimizer = HybridParallelNaiveOptimizer(
-                        optimizer,
-                        model,
-                        use_pipeline=self.enable_pipeline_parallelism,
-                        param_info=param_info,
-                        max_norm=self.max_norm,
-                        pp_process_group=self.pp_group,
-                        tp_process_group=self.tp_group,
-                    )
+            if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
+                if zero_stage == 0:
+                    is_zero = False
+                    if self.precision in ["fp16", "bf16"]:
+                        optimizer = HybridParallelAMPOptimizer(
+                            optimizer,
+                            model,
+                            use_pipeline=self.enable_pipeline_parallelism,
+                            param_info=param_info,
+                            precision=self.precision,
+                            max_norm=self.max_norm,
+                            pp_process_group=self.pp_group,
+                            tp_process_group=self.tp_group,
+                            **self.amp_config,
+                        )
+                    else:
+                        optimizer = HybridParallelNaiveOptimizer(
+                            optimizer,
+                            model,
+                            use_pipeline=self.enable_pipeline_parallelism,
+                            param_info=param_info,
+                            max_norm=self.max_norm,
+                            pp_process_group=self.pp_group,
+                            tp_process_group=self.tp_group,
+                        )
             else:
                 is_zero = self.dp_size > 1
                 if self.dp_size == 1:

From 2eb36839c6057533d44751894e4729ec733858e7 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 19 Aug 2024 09:23:10 +0000
Subject: [PATCH 041/167] fix

---
 colossalai/moe/_operation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/moe/_operation.py b/colossalai/moe/_operation.py
index e16dd7ad100a..bfbf00b5a8f3 100644
--- a/colossalai/moe/_operation.py
+++ b/colossalai/moe/_operation.py
@@ -452,4 +452,4 @@ def all_to_all_uneven(
     assert (
         inputs.requires_grad
     ), "Input must require grad to assure that backward is executed, otherwise it might hang the program."
-    return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap)
+    return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap, fp8_communicatio)

From 88b3f0698c976f11ca96940aa5895c24c4d1793e Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 19 Aug 2024 10:11:27 +0000
Subject: [PATCH 042/167] fix the merge

---
 colossalai/shardformer/layer/linear.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index 085609c01a60..d77dd496592f 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -202,21 +202,21 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
         # Matrix multiply.
         bias = self.bias if not self.skip_bias_add else None
 
-        if self.seq_parallel_mode is None:
-            output_parallel = linear_with_async_comm(
-                input_parallel, self.weight, bias, self.process_group, True, fp8_communication=self.fp8_communication
-            )
-        elif self.seq_parallel_mode == "split_gather":
+        if self.seq_parallel_mode == "split_gather":
             input_parallel = gather_forward_reducescatter_backward(
                 input_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
             )
-            output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, False)
+            output_parallel = linear_with_async_comm(
+                input_parallel, self.weight, bias, self.process_group, False, fp8_communication=self.fp8_communication
+            )
         elif self.seq_parallel_mode == "ring":
             output_parallel = linear_gather_forward_reducescatter_backward(
                 input_parallel, self.weight, bias, self.process_group, True, self.seq_parallel_dim, self.overlap, True
             )
         else:
-            output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, True)
+            output_parallel = linear_with_async_comm(
+                input_parallel, self.weight, bias, self.process_group, True, fp8_communication=self.fp8_communication
+            )
 
         if self.gather_output:
             # All-gather across the partitions.

From 1f703e0ef48dc163beae773995ca42695da46135 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 19 Aug 2024 10:15:16 +0000
Subject: [PATCH 043/167] fix

---
 colossalai/moe/_operation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/moe/_operation.py b/colossalai/moe/_operation.py
index bfbf00b5a8f3..ba087a03b728 100644
--- a/colossalai/moe/_operation.py
+++ b/colossalai/moe/_operation.py
@@ -452,4 +452,4 @@ def all_to_all_uneven(
     assert (
         inputs.requires_grad
     ), "Input must require grad to assure that backward is executed, otherwise it might hang the program."
-    return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap, fp8_communicatio)
+    return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap, fp8_communication)

From 53823118f2fc91539b46442f216cd203fe7b5f60 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Tue, 20 Aug 2024 03:20:13 +0000
Subject: [PATCH 044/167] fix

---
 .../booster/plugin/hybrid_parallel_plugin.py  | 50 +++++++++----------
 .../booster/plugin/low_level_zero_plugin.py   | 11 ++--
 colossalai/shardformer/layer/_operation.py    |  4 +-
 colossalai/shardformer/modeling/llama.py      |  2 +-
 colossalai/shardformer/policies/mixtral.py    | 13 +++--
 5 files changed, 42 insertions(+), 38 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index bd970878f1dd..a92371485ee7 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1278,31 +1278,31 @@ def configure(
                 overlap_allgather=(self.zero_stage > 0 and self.zero_config["overlap_allgather"]),
                 use_fp8=self.use_fp8,
             )
-            if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
-                if zero_stage == 0:
-                    is_zero = False
-                    if self.precision in ["fp16", "bf16"]:
-                        optimizer = HybridParallelAMPOptimizer(
-                            optimizer,
-                            model,
-                            use_pipeline=self.enable_pipeline_parallelism,
-                            param_info=param_info,
-                            precision=self.precision,
-                            max_norm=self.max_norm,
-                            pp_process_group=self.pp_group,
-                            tp_process_group=self.tp_group,
-                            **self.amp_config,
-                        )
-                    else:
-                        optimizer = HybridParallelNaiveOptimizer(
-                            optimizer,
-                            model,
-                            use_pipeline=self.enable_pipeline_parallelism,
-                            param_info=param_info,
-                            max_norm=self.max_norm,
-                            pp_process_group=self.pp_group,
-                            tp_process_group=self.tp_group,
-                        )
+        if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
+            if zero_stage == 0:
+                is_zero = False
+                if self.precision in ["fp16", "bf16"]:
+                    optimizer = HybridParallelAMPOptimizer(
+                        optimizer,
+                        model,
+                        use_pipeline=self.enable_pipeline_parallelism,
+                        param_info=param_info,
+                        precision=self.precision,
+                        max_norm=self.max_norm,
+                        pp_process_group=self.pp_group,
+                        tp_process_group=self.tp_group,
+                        **self.amp_config,
+                    )
+                else:
+                    optimizer = HybridParallelNaiveOptimizer(
+                        optimizer,
+                        model,
+                        use_pipeline=self.enable_pipeline_parallelism,
+                        param_info=param_info,
+                        max_norm=self.max_norm,
+                        pp_process_group=self.pp_group,
+                        tp_process_group=self.tp_group,
+                    )
             else:
                 is_zero = self.dp_size > 1
                 if self.dp_size == 1:
diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 448fb9e2193d..088fa1daaff4 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -90,13 +90,14 @@ def __init__(
         if self.dtype is not None and cast_inputs:
             self.convert_fn = partial(_convert_floating_point, dtype=self.dtype)
         if overlap_allgather:
-            self.op_hook = ZeroOpHook()
+            self.op_hooks.append(ZeroOpHook())
+        if use_fp8:
+            self.op_hooks.append(FP8Hook())
+        if overlap_allgather or use_fp8:
             for p in module.parameters():
                 if p.requires_grad and type(p) is not ColoParameter:
                     p.__class__ = ColoParameter
                     p.__init__(p, requires_grad=True)
-        if use_fp8:
-            self.op_hooks.append(FP8Hook())
 
     def forward(self, *args, **kwargs):
         if self.convert_fn is not None:
@@ -348,9 +349,9 @@ def __init__(
         cpu_offload: bool = False,
         master_weights: bool = True,
         verbose: bool = False,
+        cast_inputs: bool = True,
         fp8_communication: bool = False,
         use_fp8: bool = False,
-        cast_inputs: bool = True,
     ) -> None:
         super().__init__()
         assert stage in (1, 2), f"LowLevelZeroPlugin only supports stage 1/2 training"
@@ -497,8 +498,8 @@ def configure(
                 model,
                 self.precision,
                 overlap_allgather=self.zero_optim_kwargs["overlap_allgather"],
-                use_fp8=self.use_fp8,
                 cast_inputs=self.cast_inputs,
+                use_fp8=self.use_fp8,
             )
 
         # TODO: Support Galore + ZeRO
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index aed9d8351108..bfe408065d96 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -1082,8 +1082,8 @@ def split_forward_gather_backward(input_, dim, process_group, grad_scale=None, f
     return _SplitForwardGatherBackward.apply(input_, dim, process_group, grad_scale, fp8_communication)
 
 
-def reduce_forward(input_, process_group, fp8_communication=False):
-    return _ReduceForward.apply(input_, process_group, fp8_communication)
+def reduce_forward(input_, process_group, grad_scale=None, fp8_communication=False):
+    return _ReduceForward.apply(input_, process_group, grad_scale, fp8_communication)
 
 
 def reduce_backward(input_, process_group, fp8_communication=False):
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 71d8daa35309..3f05d0428bf1 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -698,7 +698,7 @@ def forward(
 
         if shard_config.enable_flash_attention:
             mask_shape = (inputs_embeds.shape[0], 1, past_seen_tokens + seq_len, past_seen_tokens + seq_len)
-            attention_mask = ColoAttention.prepare_attn_kwargs(
+            attn_kwargs: dict = ColoAttention.prepare_attn_kwargs(
                 mask_shape,
                 inputs_embeds.dtype,
                 inputs_embeds.device,
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index 4bdca78cb0ac..3a373889c5c6 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -144,10 +144,14 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 description=SubModuleReplacementDescription(
                     suffix="embed_tokens",
                     target_module=embedding_cls,
-                    kwargs={
-                        "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
-                        "fp8_communication": self.shard_config.fp8_communication,
-                    },
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=MixtralModel,
@@ -164,7 +168,6 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                             "ep_group": self.shard_config.ep_group,
                             "tp_group": self.shard_config.tensor_parallel_process_group,
                             "moe_dp_group": self.shard_config.moe_dp_group,
-                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     )
                 ],

From f7acfa1bd5aa648cf3cf0e00005265c9b37870dd Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Tue, 20 Aug 2024 05:07:58 +0000
Subject: [PATCH 045/167] fix

---
 colossalai/shardformer/layer/_operation.py       | 5 ++++-
 colossalai/shardformer/layer/qkv_fused_linear.py | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index bfe408065d96..ed7b352335e6 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -767,11 +767,14 @@ class _ReduceForward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group, fp8_communication=False):
+    def forward(ctx, input_, process_group, grad_scale=None, fp8_communication=False):
+        ctx.grad_scale = grad_scale
         return _reduce(input_, process_group, fp8_communication, fp8_format="e4m3")
 
     @staticmethod
     def backward(ctx, grad_output):
+        if ctx.grad_scale is not None:
+            grad_output = grad_output * ctx.grad_scale
         return grad_output, None, None
 
 
diff --git a/colossalai/shardformer/layer/qkv_fused_linear.py b/colossalai/shardformer/layer/qkv_fused_linear.py
index 561867993402..f9a41a467300 100644
--- a/colossalai/shardformer/layer/qkv_fused_linear.py
+++ b/colossalai/shardformer/layer/qkv_fused_linear.py
@@ -555,7 +555,7 @@ def forward(self, input_: Tensor) -> Tensor:
         else:
             if self.seq_parallel_mode is None:
                 output_parallel = torch.matmul(input_, self.weight)
-                output = reduce_forward(output_parallel, self.process_group, self.fp8_communication)
+                output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
             elif self.seq_parallel_mode == "split_gather":
                 output_parallel = torch.matmul(input_, self.weight)
                 output = reducescatter_forward_gather_backward(

From 2ee6235cfa352a8610050680353631611407bfeb Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Tue, 20 Aug 2024 06:48:16 +0000
Subject: [PATCH 046/167] fix

---
 colossalai/shardformer/layer/_operation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index ed7b352335e6..bb403224fb72 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -775,7 +775,7 @@ def forward(ctx, input_, process_group, grad_scale=None, fp8_communication=False
     def backward(ctx, grad_output):
         if ctx.grad_scale is not None:
             grad_output = grad_output * ctx.grad_scale
-        return grad_output, None, None
+        return grad_output, None, None, None
 
 
 class _ReduceBackward(torch.autograd.Function):

From 2e4cbe3a2d1dc60cfe44b99ce5a5b68433fe6b97 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Tue, 20 Aug 2024 09:11:02 +0000
Subject: [PATCH 047/167] fix

---
 .../test_shardformer/test_model/test_shard_llama.py  | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index a91ffd00da27..3c66f609787a 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -153,8 +153,9 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @parameterize(
     "test_config",
     [
-        {  # Test ring + Flash attention
-            "tp_size": 2,
+        # Double Ring Attention
+        {
+            "tp_size": 1,
             "pp_size": 1,
             "sp_size": 4,
             "num_microbatches": 1,
@@ -166,19 +167,20 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "initial_scale": 1,
             "inner_ring_size": 2,
         },
-        {  # Ulysess + Flash attention
+        # Ring Attention + PP
+        {
             "tp_size": 1,
             "pp_size": 2,
             "sp_size": 2,
             "num_microbatches": 2,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
-            "enable_flash_attention": True,
+            "sequence_parallelism_mode": "ring_attn",
             "use_lazy_init": True,
             "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
         },
+        # Ring Attention + TP
         {
             "tp_size": 2,
             "pp_size": 1,

From eb5ba40defe727927abe1e8da74103ecdd6e049f Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 21 Aug 2024 02:58:23 +0000
Subject: [PATCH 048/167] fix the merge

---
 colossalai/shardformer/modeling/deepseek.py | 4 +++-
 colossalai/shardformer/modeling/mixtral.py  | 5 +++++
 colossalai/shardformer/policies/qwen2.py    | 3 ++-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/colossalai/shardformer/modeling/deepseek.py b/colossalai/shardformer/modeling/deepseek.py
index 59f9d4516ee7..7ec390d6ab98 100644
--- a/colossalai/shardformer/modeling/deepseek.py
+++ b/colossalai/shardformer/modeling/deepseek.py
@@ -146,7 +146,9 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
         # [n0, n1, n2, n3] [m0, m1, m2, m3] -> [n0, n1, m0, m1] [n2, n3, m2, m3]
         dist.all_to_all_single(
-            output_split_sizes, input_split_sizes, group=self.ep_group, fp8_communication=fp8_communication
+            output_split_sizes,
+            input_split_sizes,
+            group=self.ep_group,
         )
 
         with torch.no_grad():
diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index 50334677e3fc..4850ef1b6929 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -68,6 +68,11 @@ def setup_process_groups(
         self.ep_size = dist.get_world_size(ep_group)
         self.ep_rank = dist.get_rank(ep_group)
         self.ep_group = ep_group
+        self.fp8_communication = fp8_communication
+
+        if self.num_experts % self.ep_size != 0:
+            raise ValueError("The number of experts must be divisible by the number of expert parallel groups.")
+
         self.num_experts_per_ep = self.num_experts // self.ep_size
         self.expert_start_idx = self.ep_rank * self.num_experts_per_ep
         held_experts = self.experts[self.expert_start_idx : self.expert_start_idx + self.num_experts_per_ep]
diff --git a/colossalai/shardformer/policies/qwen2.py b/colossalai/shardformer/policies/qwen2.py
index 7f82f829096c..1b066200de64 100644
--- a/colossalai/shardformer/policies/qwen2.py
+++ b/colossalai/shardformer/policies/qwen2.py
@@ -119,6 +119,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
+                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
@@ -319,7 +320,7 @@ def module_policy(self):
         setattr(self.shard_config, "causal_lm", True)
 
         if self.shard_config.enable_tensor_parallelism:
-            # add a new item for causal lm
+            # add a new item for casual lm
             new_item = {
                 Qwen2ForCausalLM: ModulePolicyDescription(
                     sub_module_replacement=[

From 193030f696f941b5ad36bb37cc4763f5f36dbc2e Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 21 Aug 2024 03:21:49 +0000
Subject: [PATCH 049/167] fix

---
 colossalai/shardformer/layer/_operation.py |  6 ++--
 colossalai/shardformer/modeling/llama.py   | 32 +++++++---------------
 2 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index bb403224fb72..f58a21df3614 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -1097,11 +1097,13 @@ def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1, fp8
     return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim, fp8_communication)
 
 
-def gather_sp_output(hidden_states, sp_group, sp_mode):
+def gather_sp_output(hidden_states, sp_group, sp_mode, fp8_communication=False):
     """
     Gather the output of the last layer for cross entropy computation
     """
     # Rescale grad (HybridParallelPlugin applies ZeRO grad averaging on the DP * SP group)
     scale = None if is_share_sp_tp(sp_mode) else dist.get_world_size(sp_group)
-    hidden_states = gather_forward_split_backward(hidden_states, 1, sp_group, grad_scale=scale)
+    hidden_states = gather_forward_split_backward(
+        hidden_states, 1, sp_group, grad_scale=scale, fp8_communication=fp8_communication
+    )
     return hidden_states
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 3f05d0428bf1..f18d757abba7 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -26,11 +26,7 @@
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer.layer import AttnMaskType
-from colossalai.shardformer.layer._operation import (
-    all_to_all_comm,
-    gather_forward_split_backward,
-    split_forward_gather_backward,
-)
+from colossalai.shardformer.layer._operation import all_to_all_comm, gather_sp_output, split_forward_gather_backward
 from colossalai.shardformer.layer.utils import is_share_sp_tp, split_batch_zigzag
 from colossalai.shardformer.shard import ShardConfig
 
@@ -235,12 +231,8 @@ def llama_model_forward(
         if stage_manager.is_last_stage():
             hidden_states = self.norm(hidden_states)
             if (not shard_config.parallel_output) or force_sp_output_gather or is_share_sp_tp(sp_mode):
-                hidden_states = gather_forward_split_backward(
-                    hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
-                )
-            elif sp_mode == "all_to_all":
-                hidden_states = gather_forward_split_backward(
-                    hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
+                hidden_states = gather_sp_output(
+                    hidden_states, sp_group, sp_mode, fp8_communication=shard_config.fp8_communication
                 )
 
         # add hidden states from the last decoder layer
@@ -546,7 +538,6 @@ def forward(
 
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
-            query_states = all_to_all_comm(query_states, sp_group, fp8_communication=shard_config.fp8_communication)
             key_states = all_to_all_comm(key_states, sp_group, fp8_communication=shard_config.fp8_communication)
             value_states = all_to_all_comm(value_states, sp_group, fp8_communication=shard_config.fp8_communication)
             bsz, q_len, _ = query_states.size()
@@ -565,6 +556,7 @@ def forward(
                 )
 
             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
         cos, sin = self.rotary_emb(value_states, position_ids)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
@@ -683,7 +675,7 @@ def forward(
 
         past_seen_tokens = 0
         seq_len = inputs_embeds.shape[1]
-        inputs_embeds.shape[0]
+        batch_size = inputs_embeds.shape[0]
         if use_cache:  # kept for BC (cache positions)
             if not isinstance(past_key_values, StaticCache):
                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
@@ -697,7 +689,7 @@ def forward(
             position_ids = cache_position.unsqueeze(0)
 
         if shard_config.enable_flash_attention:
-            mask_shape = (inputs_embeds.shape[0], 1, past_seen_tokens + seq_len, past_seen_tokens + seq_len)
+            mask_shape = (batch_size, 1, seq_len, past_seen_tokens + seq_len)
             attn_kwargs: dict = ColoAttention.prepare_attn_kwargs(
                 mask_shape,
                 inputs_embeds.dtype,
@@ -771,14 +763,10 @@ def forward(
                 all_self_attns += (layer_outputs[1],)
 
         hidden_states = self.norm(hidden_states)
-
-        if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(
-                hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
-            )
-        elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(
-                hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
+        # Cases that don't support parallelizing cross entropy computation along sequence
+        if (not shard_config.parallel_output) or is_share_sp_tp(sp_mode) or force_sp_output_gather:
+            hidden_states = gather_sp_output(
+                hidden_states, sp_group, sp_mode, fp8_communication=shard_config.fp8_communication
             )
 
         # add hidden states from the last decoder layer

From 6aface9316e77349eeb23362eb562f07357d7252 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 21 Aug 2024 03:51:25 +0000
Subject: [PATCH 050/167] fix

---
 colossalai/shardformer/policies/llama.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 6c5e2c2eac1c..429eec52f7bc 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -391,7 +391,12 @@ def module_policy(self):
                 LlamaForSequenceClassification: ModulePolicyDescription(
                     sub_module_replacement=[
                         SubModuleReplacementDescription(
-                            suffix="score", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                            suffix="score",
+                            target_module=Linear1D_Col,
+                            kwargs=dict(
+                                gather_output=True,
+                                fp8_communication=self.shard_config.fp8_communication,
+                            ),
                         )
                     ]
                 )

From 698c8b98046a7603d9c2842ae494fa4cffa81b5c Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 21 Aug 2024 03:58:21 +0000
Subject: [PATCH 051/167] fix

---
 colossalai/shardformer/modeling/llama.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index f18d757abba7..4e8f67407bc6 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -538,6 +538,7 @@ def forward(
 
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
+            query_states = all_to_all_comm(query_states, sp_group, fp8_communication=shard_config.fp8_communication)
             key_states = all_to_all_comm(key_states, sp_group, fp8_communication=shard_config.fp8_communication)
             value_states = all_to_all_comm(value_states, sp_group, fp8_communication=shard_config.fp8_communication)
             bsz, q_len, _ = query_states.size()

From 8b8e28244123534fdbe8371535216eb4a08fa82a Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 21 Aug 2024 09:18:45 +0000
Subject: [PATCH 052/167] fix

---
 requirements/requirements-test.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index 93a3690fe1d3..3fcf53e1858e 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -9,7 +9,7 @@ torchx-nightly==2022.6.29 # torchrec 0.2.0 requires torchx-nightly. This package
 torchrec==0.2.0
 contexttimer
 einops
-triton==2.1.0
+triton
 requests==2.27.1 # downgrade to avoid huggingface error https://github.com/huggingface/transformers/issues/17611
 SentencePiece
 ninja

From eea37da6fa62c35b8ec35607fe9fdf1e14287df3 Mon Sep 17 00:00:00 2001
From: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Date: Thu, 22 Aug 2024 09:21:34 +0800
Subject: [PATCH 053/167] [fp8] Merge feature/fp8_comm to main branch of
 Colossalai (#6016)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add SimPO

* fix dataloader

* remove debug code

* add orpo

* fix style

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix torch colossalai version

* update transformers version

* [shardformer] DeepseekMoE support (#5871)

* [Feature] deepseek moe expert parallel implement

* [misc] fix typo, remove redundant file (#5867)

* [misc] fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] deepseek support & unit test

* [misc] remove debug code & useless print

* [misc] fix typos (#5872)

* [Feature] remove modeling file, use auto config. (#5884)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [Deepseek] remove redundant code (#5888)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [Feature/deepseek] resolve comment. (#5889)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [misc] mv module replacement into if branch

* [misc] add some warning message and modify some code in unit test

* [misc] fix typos

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)

* Diffusion Model Inference support

* Stable Diffusion 3 Support

* pixartalpha support

* [HotFix] CI,import,requirements-test for #5838 (#5892)

* [Hot Fix] CI,import,requirements-test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] Enable PP + SP for llama (#5868)

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* use a one cross entropy func for all shardformer models

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)

* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint

* fix style

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix eval

* hotfix citation

* [zero] support all-gather overlap (#5898)

* [zero] support all-gather overlap

* [zero] add overlap all-gather flag

* [misc] fix typo

* [zero] update api

* fix orpo cross entropy loss

* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)

* Remove unnecessary calls to deepcopy

* Build DimSpec's difference dict only once

This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.

* Fix documentation of DimSpec's difference method

* [ShardFormer] fix qwen2 sp (#5903)

* [compatibility] support torch 2.2 (#5875)

* Support Pytorch 2.2.2

* keep build_on_pr file and update .compatibility

* fix object_to_tensor usage when torch>=2.3.0 (#5820)

* [misc] support torch2.3 (#5893)

* [misc] support torch2.3

* [devops] update compatibility ci

* [devops] update compatibility ci

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] remove debug

* [devops] remove debug

* [release] update version (#5912)

* [plugin] support all-gather overlap for hybrid parallel (#5919)

* [plugin] fixed all-gather overlap support for hybrid parallel

* add kto

* fix style, add kto data sample

* [Examples] Add lazy init to OPT and GPT examples (#5924)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [ColossalChat] Hotfix for ColossalChat (#5910)

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* fix ddp issue

* add Qwen 1.5 32B

* refactor tokenization

* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)

* cannot access local variable 'default_conversation' where it is not associated with a value

set default value for 'default_conversation'

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix test data

* refactor evaluation

* remove real data path

* remove real data path

* Add n_fused as an input from native_module (#5894)

* [FIX BUG] convert env param to int in (#5934)

* [Hotfix] Fix ZeRO typo #5936

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)

* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix style

* fix style

* fix style

* [shardformer] hotfix attn mask (#5945)

* [shardformer] hotfix attn mask (#5947)

* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)

* Distrifusion Support source

* comp comm overlap optimization

* sd3 benchmark

* pixart distrifusion bug fix

* sd3 bug fix and benchmark

* generation bug fix

* naming fix

* add docstring, fix counter and shape error

* add reference

* readme and requirement

* [zero] hotfix update master params (#5951)

* [release] update version (#5952)

* [Chat] Fix lora (#5946)

* fix merging

* remove filepath

* fix style

* Update README.md (#5958)

* [hotfix] Remove unused plan section (#5957)

* remove readme

* fix readme

* update

* [test] add mixtral for sequence classification

* [test] add mixtral transformer test

* [moe] fix plugin

* [test] mixtra pp shard test

* [chore] handle non member group

* [zero] solve hang

* [test] pass mixtral shardformer test

* [moe] implement transit between non moe tp and ep

* [zero] solve hang

* [misc] solve booster hang by rename the variable

* solve hang when parallel mode = pp + dp

* [moe] implement submesh initialization

* [moe] add mixtral dp grad scaling when not all experts are activated

* [chore] manually revert unintended commit

* [chore] trivial fix

* [chore] arg pass & remove drop token

* [test] add mixtral modelling test

* [moe] implement tp

* [moe] test deepseek

* [moe] clean legacy code

* [Feature] MoE Ulysses Support (#5918)

* moe sp support

* moe sp bug solve

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [chore] minor fix

* [moe] init moe plugin comm setting with sp

* moe sp + ep bug fix

* [moe] finalize test (no pp)

* [moe] full test for deepseek and mixtral (pp + sp to fix)

* [chore] minor fix after rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [chore] solve moe ckpt test failure and some other arg pass failure

* [moe] remove ops

* [test] fix test: test_zero1_2

* [bug] fix: somehow logger hangs the program

* [moe] deepseek moe sp support

* [test] add check

* [deepseek] replace attn (a workaround for bug in transformers)

* [misc] skip redunant test

* [misc] remove debug/print code

* [moe] refactor mesh assignment

* Revert "[moe] implement submesh initialization"

This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582.

* [chore] change moe_pg_mesh to private

* [misc] remove incompatible test config

* [misc] fix ci failure: change default value to false in moe plugin

* [misc] remove useless condition

* [chore] docstring

* [moe] remove force_overlap_comm flag and add warning instead

* [doc] add MoeHybridParallelPlugin docstring

* [moe] solve dp axis issue

* [chore] remove redundant test case, print string & reduce test tokens

* [feat] Dist Loader for Eval (#5950)

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix tp error

* remove unused parameters

* remove unused

* update inference

* update docs

* update inference

---------

Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [lora] lora support hybrid parallel plugin (#5956)

* lora support hybrid plugin

* fix

* fix

* fix

* fix

* Support overall loss, update KTO logging

* [Docs] clarify launch port

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Hotfix] README link (#5966)

* update ignore

* update readme

* run style

* update readme

* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Chat] fix readme (#5989)

* fix readme

* fix readme, tokenization fully tested

* fix readme, tokenization fully tested

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix sync condition (#6000)

* [plugin] add cast inputs option for zero (#6003)

* [pre-commit.ci] pre-commit autoupdate (#5995)

updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)

* [Feature] Zigzag Ring attention (#5905)

* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add sp_mode to benchmark; fix varlen interface

* update softmax_lse shape by new interface

* change tester name

* remove buffer clone; support packed seq layout

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [misc] update compatibility (#6008)

* [misc] update compatibility

* [misc] update requirements

* [devops] disable requirements cache

* [test] fix torch ddp test

* [test] fix rerun on address in use

* [test] fix lazy init

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix the merge

* overlap kv comm with output rescale (#6017)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* fix the merge

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix

* fix

* fix the merge

* fix

* [misc] Use dist logger in plugins (#6011)

* use dist logger in plugins

* remove trash

* print on rank 0

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* fix

* fix

* fix

* fix

* fix the merge

* fix

* fix

* fix

* fix

---------

Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
---
 .compatibility                                |   1 +
 .cuda_ext.json                                |   4 +-
 .github/workflows/build_on_pr.yml             |   2 +-
 .github/workflows/build_on_schedule.yml       |   2 +-
 .pre-commit-config.yaml                       |   3 +-
 applications/ColossalChat/.gitignore          |   1 +
 applications/ColossalChat/README.md           |   2 +-
 .../coati/dataset/tokenization_utils.py       |  21 +-
 .../ColossalChat/coati/models/loss.py         |  16 +-
 .../ColossalChat/coati/models/utils.py        |   7 +-
 .../ColossalChat/coati/trainer/dpo.py         |   9 +
 .../ColossalChat/coati/trainer/kto.py         |  37 +-
 .../ColossalChat/coati/trainer/orpo.py        |  12 +
 .../ColossalChat/coati/trainer/ppo.py         |  12 +-
 .../ColossalChat/coati/trainer/sft.py         |  14 +-
 applications/ColossalChat/examples/README.md  |  29 +-
 .../examples/inference/inference.py           |   5 +-
 .../examples/training_scripts/train_dpo.py    |   8 +
 .../examples/training_scripts/train_kto.py    |   2 +
 .../examples/training_scripts/train_orpo.py   |   2 +
 .../examples/training_scripts/train_ppo.py    |   4 +-
 .../examples/training_scripts/train_sft.py    |   2 +
 applications/ColossalChat/requirements.txt    |   2 +-
 applications/ColossalChat/tests/test_train.sh |  36 +-
 applications/README.md                        |   2 +-
 colossalai/booster/booster.py                 |  18 +-
 colossalai/booster/plugin/gemini_plugin.py    |  26 +-
 .../booster/plugin/hybrid_parallel_plugin.py  |  81 +-
 .../booster/plugin/low_level_zero_plugin.py   |  26 +-
 .../plugin/moe_hybrid_parallel_plugin.py      |  25 +-
 colossalai/booster/plugin/torch_ddp_plugin.py |   2 +
 .../booster/plugin/torch_fsdp_plugin.py       |  15 +-
 .../hybrid_parallel_checkpoint_io.py          |   4 -
 colossalai/lazy/pretrained.py                 |   4 -
 .../moe/openmoe/model/openmoe_policy.py       |   2 +-
 .../legacy/nn/layer/parallel_1d/_operation.py |   3 +
 colossalai/logging/logger.py                  |   5 +-
 .../pipeline/schedule/interleaved_pp.py       |   1 -
 colossalai/pipeline/schedule/one_f_one_b.py   |   1 +
 colossalai/shardformer/layer/__init__.py      |   4 +-
 colossalai/shardformer/layer/_operation.py    |  36 +-
 colossalai/shardformer/layer/attn.py          | 916 +++++++++++++++++-
 colossalai/shardformer/layer/linear.py        |  17 +-
 colossalai/shardformer/layer/loss.py          | 167 +++-
 colossalai/shardformer/layer/normalization.py |  25 +-
 .../shardformer/layer/qkv_fused_linear.py     |   2 +-
 colossalai/shardformer/layer/utils.py         | 198 +++-
 colossalai/shardformer/modeling/chatglm2.py   |  20 +
 colossalai/shardformer/modeling/command.py    |   8 +-
 colossalai/shardformer/modeling/deepseek.py   |  10 +-
 colossalai/shardformer/modeling/llama.py      | 145 ++-
 colossalai/shardformer/modeling/mixtral.py    |   4 +-
 .../shardformer/policies/base_policy.py       |   1 +
 colossalai/shardformer/policies/command.py    |  31 +-
 colossalai/shardformer/policies/llama.py      |  52 +-
 colossalai/shardformer/policies/mistral.py    |   2 +-
 colossalai/shardformer/policies/mixtral.py    |  15 +-
 colossalai/shardformer/shard/shard_config.py  |  12 +-
 colossalai/testing/utils.py                   |   2 +-
 colossalai/zero/gemini/gemini_optimizer.py    |  12 +-
 docs/source/en/basics/launch_colossalai.md    |   7 +-
 .../zh-Hans/basics/launch_colossalai.md       |   6 +-
 examples/language/llama/benchmark.py          |  33 +-
 examples/language/opt/README.md               |   2 +-
 examples/language/performance_evaluator.py    |  24 +-
 examples/tutorial/opt/opt/README.md           |   2 +-
 .../flash_attention_dao_cuda.py               |   8 +-
 requirements/requirements-test.txt            |   2 +-
 requirements/requirements.txt                 |   2 +-
 tests/kit/model_zoo/__init__.py               |   4 +-
 tests/kit/model_zoo/transformers/command.py   |  12 +-
 tests/kit/model_zoo/transformers/llama.py     |  41 +-
 tests/kit/model_zoo/transformers/mistral.py   |   2 +-
 tests/kit/model_zoo/transformers/mixtral.py   |   2 +
 tests/kit/model_zoo/transformers/qwen2.py     |  12 +-
 .../test_plugin/test_3d_plugin.py             |   2 +-
 .../test_plugin/test_low_level_zero_plugin.py |   2 +-
 .../test_plugin/test_torch_ddp_plugin.py      |   2 +-
 .../test_gemini_checkpoint_io.py              |   2 +-
 .../test_gemini_torch_compability.py          |   2 +-
 ...st_hybrid_parallel_plugin_checkpoint_io.py |   2 +-
 .../test_low_level_zero_checkpoint_io.py      |   2 +-
 .../test_plugins_huggingface_compatibility.py |   2 +-
 tests/test_lazy/test_models.py                |  14 +-
 tests/test_lora/test_lora.py                  |   2 +-
 .../test_schedule/test_interleaved.py         |  17 +-
 .../test_schedule/test_oneF_oneB.py           |  17 +-
 .../test_shardformer/test_flash_attention.py  |   3 +
 .../test_layer/test_ring_attn.py              | 186 ++++
 tests/test_shardformer/test_model/_utils.py   |  27 +-
 .../test_model/test_shard_command.py          |   4 +-
 .../test_model/test_shard_llama.py            | 113 ++-
 92 files changed, 2222 insertions(+), 463 deletions(-)
 create mode 100644 tests/test_shardformer/test_layer/test_ring_attn.py

diff --git a/.compatibility b/.compatibility
index 4f808740bc02..62d19faffa9e 100644
--- a/.compatibility
+++ b/.compatibility
@@ -1,3 +1,4 @@
 2.1.0-12.1.0
 2.2.2-12.1.0
 2.3.0-12.1.0
+2.4.0-12.4.1
diff --git a/.cuda_ext.json b/.cuda_ext.json
index 8c9d5916ccd8..1e617755b01b 100644
--- a/.cuda_ext.json
+++ b/.cuda_ext.json
@@ -5,8 +5,8 @@
       "cuda_image": "hpcaitech/cuda-conda:12.1"
     },
     {
-      "torch_command": "pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118",
-      "cuda_image": "hpcaitech/cuda-conda:11.8"
+      "torch_command": "pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124",
+      "cuda_image": "hpcaitech/cuda-conda:12.4"
     }
   ]
 }
diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 151454239afe..58cd8826809a 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -141,7 +141,7 @@ jobs:
       - name: Install Colossal-AI
         run: |
           BUILD_EXT=1 pip install -v -e .
-          pip install -r requirements/requirements-test.txt
+          pip install --no-cache-dir -r requirements/requirements-test.txt
 
       - name: Store Colossal-AI Cache
         run: |
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index fc6424503fbc..fc688a71bd92 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -57,7 +57,7 @@ jobs:
           [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
           BUILD_EXT=1 pip install -v -e .
           cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
-          pip install -r requirements/requirements-test.txt
+          pip install --no-cache-dir -r requirements/requirements-test.txt
 
       - name: Unit Testing
         if: steps.check-avai.outputs.avai == 'true'
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9088d0e1bb71..f7217a8f1e86 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -12,9 +12,10 @@ repos:
     hooks:
       - id: isort
         name: sort all imports (python)
+        args: ["--profile", "black"] # avoid conflict with black
 
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.4.2
+    rev: 24.8.0
     hooks:
     - id: black
       name: black formatter
diff --git a/applications/ColossalChat/.gitignore b/applications/ColossalChat/.gitignore
index 757cbb5da051..7b361d38e6d0 100755
--- a/applications/ColossalChat/.gitignore
+++ b/applications/ColossalChat/.gitignore
@@ -151,6 +151,7 @@ examples/training_scripts/wandb
 examples/training_scripts/output
 
 examples/awesome-chatgpt-prompts/
+examples/inference/round.txt
 temp/
 
 # ColossalChat
diff --git a/applications/ColossalChat/README.md b/applications/ColossalChat/README.md
index de27ebaf6be1..3604fab103a2 100755
--- a/applications/ColossalChat/README.md
+++ b/applications/ColossalChat/README.md
@@ -121,7 +121,7 @@ cd $COLOSSAL_AI_ROOT
 BUILD_EXT=1 pip install .
 
 # Install ColossalChat
-cd $COLOSSAL_AI_ROOT/applications/Chat
+cd $COLOSSAL_AI_ROOT/applications/ColossalChat
 pip install .
 ```
 
diff --git a/applications/ColossalChat/coati/dataset/tokenization_utils.py b/applications/ColossalChat/coati/dataset/tokenization_utils.py
index 9eb2eba87bf2..020432b9ec3c 100755
--- a/applications/ColossalChat/coati/dataset/tokenization_utils.py
+++ b/applications/ColossalChat/coati/dataset/tokenization_utils.py
@@ -49,6 +49,10 @@ def tokenize_sft(
 
     messages = data_point["messages"]
     template = deepcopy(conversation_template)
+
+    if messages[0]["from"] == "system":
+        template.system_message = str(messages[0]["content"])
+        messages.pop(0)
     template.messages = []
     for idx, mess in enumerate(messages):
         if mess["from"] != template.roles[idx % 2]:
@@ -148,11 +152,14 @@ def tokenize_prompt(
     template = deepcopy(conversation_template)
     template.messages = []
 
+    if messages[0]["from"] == "system":
+        template.system_message = str(messages[0]["content"])
+        messages.pop(0)
+
     for idx, mess in enumerate(messages):
         if mess["from"] != template.roles[idx % 2]:
             raise ValueError(
-                f"Message should iterate between user and assistant and starts with a \
-                             line from the user. Got the following data:\n{messages}"
+                f"Message should iterate between user and assistant and starts with a line from the user. Got the following data:\n{messages}"
             )
         template.append_message(mess["from"], mess["content"])
 
@@ -162,7 +169,7 @@ def tokenize_prompt(
         template.messages = template.messages[:-1]
 
     # Prepare data
-    prompt = template.get_prompt(length=len(template.messages) - 1, add_generation_prompt=True)
+    prompt = template.get_prompt(length=len(template.messages), add_generation_prompt=True)
     tokenized = tokenizer([prompt], add_special_tokens=False)["input_ids"][0]
 
     if tokenizer.bos_token_id is not None:
@@ -225,6 +232,10 @@ def tokenize_rlhf(
     template = deepcopy(conversation_template)
     template.clear()
 
+    if context[0]["from"] == "system":
+        template.system_message = str(context[0]["content"])
+        context.pop(0)
+
     for idx, mess in enumerate(context):
         if mess["from"] != template.roles[idx % 2]:
             raise ValueError(
@@ -345,6 +356,10 @@ def tokenize_kto(
     template = deepcopy(conversation_template)
     template.clear()
 
+    if prompt[0]["from"] == "system":
+        template.system_message = str(prompt[0]["content"])
+        prompt.pop(0)
+
     if prompt[0].get("from", None) != "user":
         raise ValueError("conversation should start with user")
     if completion.get("from", None) != "assistant":
diff --git a/applications/ColossalChat/coati/models/loss.py b/applications/ColossalChat/coati/models/loss.py
index 840cca074c39..bd0bbd36b9bc 100755
--- a/applications/ColossalChat/coati/models/loss.py
+++ b/applications/ColossalChat/coati/models/loss.py
@@ -46,7 +46,10 @@ def forward(
         action_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         skip = False
-        ratio_ = ((log_probs - old_log_probs) * action_mask).exp()
+        if action_mask is None:
+            ratio_ = (log_probs - old_log_probs).exp()
+        else:
+            ratio_ = ((log_probs - old_log_probs) * action_mask).exp()
 
         # note that if dropout is disabled (recommanded), ratio will always be 1.
         if ratio_.mean() > self.skip_threshold:
@@ -56,7 +59,10 @@ def forward(
         surr1 = ratio * advantages
         surr2 = ratio.clamp(1 - self.clip_eps, 1 + self.clip_eps) * advantages
         loss = -torch.min(surr1, surr2)
-        loss = masked_mean(loss, action_mask)
+        if action_mask is not None:
+            loss = masked_mean(loss, action_mask)
+        else:
+            loss = loss.mean(dim=1)
         loss = loss.mean()
         return loss, skip, ratio_.max()
 
@@ -81,8 +87,10 @@ def forward(
         values_clipped = old_values + (values - old_values).clamp(-self.clip_eps, self.clip_eps)
         surr1 = (values_clipped - returns) ** 2
         surr2 = (values - returns) ** 2
-        loss = torch.max(surr1, surr2) / torch.sum(action_mask)
-        loss = torch.sum(loss * action_mask)
+        if action_mask is not None:
+            loss = torch.sum(torch.max(surr1, surr2) / torch.sum(action_mask) * action_mask)
+        else:
+            loss = torch.mean(torch.max(surr1, surr2))
         return 0.5 * loss
 
 
diff --git a/applications/ColossalChat/coati/models/utils.py b/applications/ColossalChat/coati/models/utils.py
index 8ed8d34010b2..c583f057a5ab 100755
--- a/applications/ColossalChat/coati/models/utils.py
+++ b/applications/ColossalChat/coati/models/utils.py
@@ -138,6 +138,7 @@ def disable_dropout(model: torch.nn.Module):
     Returns:
         None
     """
-    for module in model.modules():
-        if isinstance(module, torch.nn.Dropout):
-            module.p = 0.0
+    if model is not None:
+        for module in model.modules():
+            if isinstance(module, torch.nn.Dropout):
+                module.p = 0.0
diff --git a/applications/ColossalChat/coati/trainer/dpo.py b/applications/ColossalChat/coati/trainer/dpo.py
index c7ef2be8f6c4..24ddca6545c8 100755
--- a/applications/ColossalChat/coati/trainer/dpo.py
+++ b/applications/ColossalChat/coati/trainer/dpo.py
@@ -56,6 +56,7 @@ def __init__(
         beta: float = 0.1,
         gamma: float = 0.0,
         length_normalization: bool = False,
+        apply_loss_mask: bool = True,
         accumulation_steps: int = 1,
         start_epoch: int = 0,
         save_interval: int = 0,
@@ -67,6 +68,7 @@ def __init__(
         self.actor_scheduler = actor_lr_scheduler
         self.tokenizer = tokenizer
         self.actor_loss_fn = DpoLoss(beta, gamma)
+        self.apply_loss_mask = apply_loss_mask
         self.save_interval = save_interval
         self.coordinator = coordinator
         self.save_dir = save_dir
@@ -135,6 +137,10 @@ def _train(self, epoch: int):
                 batch["reject_attention_mask"],
                 batch["reject_loss_mask"],
             )
+            if not self.apply_loss_mask:
+                chosen_loss_mask = chosen_loss_mask.fill_(1.0)
+                reject_loss_mask = reject_loss_mask.fill_(1.0)
+
             batch_size = chosen_input_ids.size()[0]
 
             actor_all_logits = self.model(
@@ -284,6 +290,9 @@ def _eval(self, epoch: int):
                     batch["reject_attention_mask"],
                     batch["reject_loss_mask"],
                 )
+                if not self.apply_loss_mask:
+                    chosen_loss_mask = chosen_loss_mask.fill_(1.0)
+                    reject_loss_mask = reject_loss_mask.fill_(1.0)
 
                 batch_size = chosen_input_ids.size()[0]
 
diff --git a/applications/ColossalChat/coati/trainer/kto.py b/applications/ColossalChat/coati/trainer/kto.py
index 8ab0bc66bcf9..6462ba816686 100755
--- a/applications/ColossalChat/coati/trainer/kto.py
+++ b/applications/ColossalChat/coati/trainer/kto.py
@@ -6,7 +6,7 @@
 from typing import Any, Optional
 
 import torch
-import torch.distributed
+import torch.distributed as dist
 from coati.models.loss import KTOLoss
 from coati.models.utils import calc_masked_log_probs
 from coati.trainer.utils import all_reduce_mean
@@ -59,6 +59,7 @@ def __init__(
         beta: float = 0.1,
         desirable_weight: float = 1.0,
         undesirable_weight: float = 1.0,
+        apply_loss_mask: bool = True,
         accumulation_steps: int = 1,
         start_epoch: int = 0,
         save_interval: int = 0,
@@ -70,6 +71,7 @@ def __init__(
         self.actor_scheduler = actor_lr_scheduler
         self.tokenizer = tokenizer
         self.kto_loss = KTOLoss(beta=beta, desirable_weight=desirable_weight, undesirable_weight=undesirable_weight)
+        self.apply_loss_mask = apply_loss_mask
         self.save_interval = save_interval
         self.coordinator = coordinator
         self.save_dir = save_dir
@@ -134,6 +136,10 @@ def _train(self, epoch: int):
                 batch["kl_attention_mask"],
                 batch["kl_loss_mask"],
             )
+            if not self.apply_loss_mask:
+                loss_mask = loss_mask.fill_(1.0)
+                kl_loss_mask = kl_loss_mask.fill_(1.0)
+
             batch_size = input_ids.size()[0]
 
             # actor logits
@@ -182,8 +188,28 @@ def _train(self, epoch: int):
 
             # sync
             loss_mean = all_reduce_mean(tensor=loss)
-            chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards.mean())
-            rejected_rewards_mean = all_reduce_mean(tensor=rejected_rewards.mean())
+            chosen_reward_mean = chosen_rewards.mean()
+            chosen_rewards_list = [
+                torch.tensor(0, dtype=loss.dtype, device=loss.device) for _ in range(dist.get_world_size())
+            ]
+            dist.all_gather(chosen_rewards_list, chosen_reward_mean)
+            rejected_reward_mean = rejected_rewards.mean()
+            rejected_rewards_list = [
+                torch.tensor(0, dtype=loss.dtype, device=loss.device) for _ in range(dist.get_world_size())
+            ]
+            dist.all_gather(rejected_rewards_list, rejected_reward_mean)
+            chosen_rewards_list = [i for i in chosen_rewards_list if not i.isnan()]
+            rejected_rewards_list = [i for i in rejected_rewards_list if not i.isnan()]
+            chosen_rewards_mean = (
+                torch.stack(chosen_rewards_list).mean()
+                if len(chosen_rewards_list) > 0
+                else torch.tensor(torch.nan, dtype=loss.dtype, device=loss.device)
+            )
+            rejected_rewards_mean = (
+                torch.stack(rejected_rewards_list).mean()
+                if len(rejected_rewards_list) > 0
+                else torch.tensor(torch.nan, dtype=loss.dtype, device=loss.device)
+            )
             self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
             self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
             self.accumulative_meter.add("loss", loss_mean.to(torch.float16).detach().item())
@@ -256,6 +282,11 @@ def _eval(self, epoch: int):
                 batch["kl_attention_mask"],
                 batch["kl_loss_mask"],
             )
+
+            if not self.apply_loss_mask:
+                loss_mask = loss_mask.fill_(1.0)
+                kl_loss_mask = kl_loss_mask.fill_(1.0)
+
             batch_size = input_ids.size()[0]
 
             # actor logits
diff --git a/applications/ColossalChat/coati/trainer/orpo.py b/applications/ColossalChat/coati/trainer/orpo.py
index b039da4afc30..c2f75771cdff 100644
--- a/applications/ColossalChat/coati/trainer/orpo.py
+++ b/applications/ColossalChat/coati/trainer/orpo.py
@@ -52,6 +52,7 @@ def __init__(
         tokenizer: PreTrainedTokenizerBase,
         max_epochs: int = 1,
         lam: float = 0.1,
+        apply_loss_mask: bool = True,
         accumulation_steps: int = 1,
         start_epoch: int = 0,
         save_interval: int = 0,
@@ -67,6 +68,7 @@ def __init__(
         self.save_dir = save_dir
         self.num_train_step = 0
         self.lam = lam
+        self.apply_loss_mask = apply_loss_mask
         self.accumulation_steps = accumulation_steps
         self.device = get_current_device()
         self.accumulative_meter = AccumulativeMeanMeter()
@@ -130,6 +132,11 @@ def _train(self, epoch: int):
                 batch["reject_attention_mask"],
                 batch["reject_loss_mask"],
             )
+
+            if not self.apply_loss_mask:
+                chosen_loss_mask = chosen_loss_mask.fill_(1.0)
+                reject_loss_mask = reject_loss_mask.fill_(1.0)
+
             batch_size = chosen_input_ids.size()[0]
             actor_out = self.model(
                 input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
@@ -263,6 +270,11 @@ def _eval(self, epoch: int):
                     batch["reject_attention_mask"],
                     batch["reject_loss_mask"],
                 )
+
+                if not self.apply_loss_mask:
+                    chosen_loss_mask = chosen_loss_mask.fill_(1.0)
+                    reject_loss_mask = reject_loss_mask.fill_(1.0)
+
                 batch_size = chosen_input_ids.size()[0]
                 actor_out = self.model(
                     input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
diff --git a/applications/ColossalChat/coati/trainer/ppo.py b/applications/ColossalChat/coati/trainer/ppo.py
index 287767669516..63c813b39ef9 100755
--- a/applications/ColossalChat/coati/trainer/ppo.py
+++ b/applications/ColossalChat/coati/trainer/ppo.py
@@ -102,6 +102,7 @@ def __init__(
         sample_buffer: bool = False,
         dataloader_pin_memory: bool = True,
         offload_inference_models: bool = True,
+        apply_loss_mask: bool = True,
         accumulation_steps: int = 1,
         save_interval: int = 0,
         save_dir: str = None,
@@ -140,6 +141,7 @@ def __init__(
         self.actor_optim = actor_optim
         self.critic_optim = critic_optim
         self.save_interval = save_interval
+        self.apply_loss_mask = apply_loss_mask
         self.coordinator = coordinator
         self.actor_save_dir = os.path.join(save_dir, "actor")
         self.critic_save_dir = os.path.join(save_dir, "critic")
@@ -229,7 +231,10 @@ def _training_step(self, experience: Experience):
         action_log_probs = calc_action_log_probs(actor_logits, experience.sequences, num_actions)
 
         actor_loss, to_skip, max_ratio = self.actor_loss_fn(
-            action_log_probs, experience.action_log_probs, experience.advantages, action_mask=experience.action_mask
+            action_log_probs,
+            experience.action_log_probs,
+            experience.advantages,
+            action_mask=experience.action_mask if self.apply_loss_mask else None,
         )
         actor_loss = (1 - self.ptx_coef) * actor_loss
         if not to_skip:
@@ -249,7 +254,10 @@ def _training_step(self, experience: Experience):
             input_ids=experience.sequences, attention_mask=experience.attention_mask
         )  # [batch size, prompt_length + response_length]
         critic_loss = self.critic_loss_fn(
-            values[:, -num_actions:], experience.values, experience.advantages, action_mask=experience.action_mask
+            values[:, -num_actions:],
+            experience.values,
+            experience.advantages,
+            action_mask=experience.action_mask if self.apply_loss_mask else None,
         )
         critic_loss = critic_loss * self.vf_coef
         self.critic_booster.backward(loss=critic_loss, optimizer=self.critic_optim)
diff --git a/applications/ColossalChat/coati/trainer/sft.py b/applications/ColossalChat/coati/trainer/sft.py
index c09d61034984..d37676ada3e0 100755
--- a/applications/ColossalChat/coati/trainer/sft.py
+++ b/applications/ColossalChat/coati/trainer/sft.py
@@ -41,6 +41,7 @@ def __init__(
         lr_scheduler: _LRScheduler,
         max_epochs: int = 2,
         accumulation_steps: int = 8,
+        apply_loss_mask: bool = True,
         start_epoch=0,
         save_interval: int = None,
         save_dir: str = None,
@@ -55,6 +56,7 @@ def __init__(
         self.coordinator = coordinator
         self.num_train_step = 0
         self.num_eval_step = 0
+        self.apply_loss_mask = apply_loss_mask
         self.accumulative_meter = AccumulativeMeanMeter()
 
     def _before_fit(
@@ -100,7 +102,11 @@ def _train(self, epoch: int):
         for i, batch in enumerate(self.train_dataloader):
             batch = to_device(batch, torch.cuda.current_device())
             batch_size = batch["input_ids"].size(0)
-            outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
+            outputs = self.model(
+                batch["input_ids"],
+                attention_mask=batch["attention_mask"],
+                labels=batch["labels"] if self.apply_loss_mask else batch["input_ids"],
+            )
             loss = outputs.loss
 
             self.booster.backward(loss=loss, optimizer=self.optimizer)
@@ -158,7 +164,11 @@ def _eval(self, epoch: int):
             )
             for batch in self.eval_dataloader:
                 batch = to_device(batch, torch.cuda.current_device())
-                outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
+                outputs = self.model(
+                    batch["input_ids"],
+                    attention_mask=batch["attention_mask"],
+                    labels=batch["labels"] if self.apply_loss_mask else batch["input_ids"],
+                )
                 loss_mean = all_reduce_mean(tensor=outputs.loss)
                 self.accumulative_meter.add("loss", loss_mean.item(), count_update=batch["input_ids"].size(0))
                 step_bar.update()
diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md
index b749f197ed21..fec7bc061270 100755
--- a/applications/ColossalChat/examples/README.md
+++ b/applications/ColossalChat/examples/README.md
@@ -387,6 +387,7 @@ colossalai run --nproc_per_node 4 --master_port 28534 --hostfile ./hostfile trai
 - save_dir: path to store the model checkpoints.
 - max_length: input will be padded/truncated to max_length before feeding to the model.
 - max_epochs: number of epochs to train.
+- disable_loss_mask: whether to use the loss mask to mask the loss or not. For example, in SFT, if the loss mask is disabled, the model will compute the loss across all tokens in the sequence, if the loss mask is applied, only tokens correspond to the assistant responses will contribute to the final loss.
 - batch_size: training batch size.
 - mixed_precision: precision to use in training. Support 'fp16' and 'bf16'. Note that some devices may not support the 'bf16' option, please refer to [Nvidia](https://developer.nvidia.com/) to check compatibility.
 - save_interval: save the model weights as well as optimizer/scheduler states every save_interval steps/episodes.
@@ -461,26 +462,24 @@ Stage1 is supervised instructs fine-tuning (SFT). This step is a crucial part of
 
 
 #### Step 1: Data Collection
-The first step in Stage 1 is to collect a dataset of human demonstrations of the following format.
+The first step in Stage 1 is to collect a dataset of human demonstrations of the following JSONL format.
 
 
 ```json
-[
-    {"messages":
-      [
-        {
-          "from": "user",
-          "content": "what are some pranks with a pen i can do?"
-        },
-        {
-          "from": "assistant",
-          "content": "Are you looking for practical joke ideas?"
-        },
-        ...
-      ]
+{"messages":
+  [
+    {
+      "from": "user",
+      "content": "what are some pranks with a pen i can do?"
+    },
+    {
+      "from": "assistant",
+      "content": "Are you looking for practical joke ideas?"
     },
     ...
-]
+  ]
+},
+...
 ```
 
 
diff --git a/applications/ColossalChat/examples/inference/inference.py b/applications/ColossalChat/examples/inference/inference.py
index 103bd8d95016..32310cce93fd 100755
--- a/applications/ColossalChat/examples/inference/inference.py
+++ b/applications/ColossalChat/examples/inference/inference.py
@@ -53,8 +53,8 @@ def load_model_and_tokenizer(model_path, tokenizer_path, device="cuda", **kwargs
         tuple: A tuple containing the loaded model and tokenizer.
     """
 
-    model = AutoModelForCausalLM.from_pretrained(model_path, **kwargs)
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path, **kwargs, trust_remote_code=True).to(torch.bfloat16)
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)
     tokenizer.pad_token = tokenizer.eos_token
     model.to(device)
 
@@ -151,7 +151,6 @@ def main(args):
         chat_io.prompt_for_output("assistant")
 
         prompt = conv.get_prompt(add_generation_prompt=True)
-        print(prompt + "<end_of_prompt>")
         input_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"].to(
             torch.cuda.current_device()
         )
diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.py b/applications/ColossalChat/examples/training_scripts/train_dpo.py
index 44131f572445..092fc6f437b3 100755
--- a/applications/ColossalChat/examples/training_scripts/train_dpo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_dpo.py
@@ -278,6 +278,10 @@ def train(args):
         beta=args.beta,
         gamma=args.gamma,
         length_normalization=args.length_normalization,
+<<<<<<< HEAD
+=======
+        apply_loss_mask=not args.disable_loss_mask,
+>>>>>>> main
     )
 
     trainer.fit(
@@ -346,6 +350,10 @@ def train(args):
         default=False,
         help="Disable the reference model (enabled by default)",
     )
+<<<<<<< HEAD
+=======
+    parser.add_argument("--disable_loss_mask", default=False, action="store_true")
+>>>>>>> main
     parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
     parser.add_argument("--lora_config", type=str, default=None, help="low-rank adaptation config file path")
     parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
diff --git a/applications/ColossalChat/examples/training_scripts/train_kto.py b/applications/ColossalChat/examples/training_scripts/train_kto.py
index d063b82bb214..598fd8062fcf 100755
--- a/applications/ColossalChat/examples/training_scripts/train_kto.py
+++ b/applications/ColossalChat/examples/training_scripts/train_kto.py
@@ -297,6 +297,7 @@ def train(args):
         beta=args.beta,
         desirable_weight=args.desirable_weight,
         undesirable_weight=args.undesirable_weight,
+        apply_loss_mask=not args.disable_loss_mask,
     )
 
     trainer.fit(
@@ -341,6 +342,7 @@ def train(args):
     parser.add_argument("--beta", type=float, default=0.1, help="beta in KTO loss")
     parser.add_argument("--desirable_weight", type=float, default=1.0, help="desirable_weight in KTO loss")
     parser.add_argument("--undesirable_weight", type=float, default=1.0, help="undesirable_weight in KTO loss")
+    parser.add_argument("--disable_loss_mask", default=False, action="store_true")
     parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true")
     parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2])
     parser.add_argument("--zero_cpu_offload", default=False, action="store_true")
diff --git a/applications/ColossalChat/examples/training_scripts/train_orpo.py b/applications/ColossalChat/examples/training_scripts/train_orpo.py
index f06524507d5f..87860f7ea023 100755
--- a/applications/ColossalChat/examples/training_scripts/train_orpo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_orpo.py
@@ -259,6 +259,7 @@ def train(args):
         save_dir=args.save_dir,
         coordinator=coordinator,
         lam=args.lam,
+        apply_loss_mask=not args.disable_loss_mask,
     )
 
     trainer.fit(
@@ -301,6 +302,7 @@ def train(args):
     parser.add_argument("--pp", type=int, default=1)
     parser.add_argument("--sp", type=int, default=1)
     parser.add_argument("--lam", type=float, default=0.1, help="lambda in ORPO loss")
+    parser.add_argument("--disable_loss_mask", default=False, action="store_true")
     parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true")
     parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2])
     parser.add_argument("--zero_cpu_offload", default=False, action="store_true")
diff --git a/applications/ColossalChat/examples/training_scripts/train_ppo.py b/applications/ColossalChat/examples/training_scripts/train_ppo.py
index 333be9963c06..a0a10e239725 100755
--- a/applications/ColossalChat/examples/training_scripts/train_ppo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_ppo.py
@@ -411,6 +411,7 @@ def train(args):
         use_cache=True,
         do_sample=True,
         temperature=0.7,
+        apply_loss_mask=not args.disable_loss_mask,
         accumulation_steps=args.accumulation_steps,
         save_dir=args.save_path,
         save_interval=args.save_interval,
@@ -498,9 +499,10 @@ def train(args):
     parser.add_argument("--critic_lr", type=float, default=9e-6)
     parser.add_argument("--kl_coef", type=float, default=0.1)
     parser.add_argument("--ptx_coef", type=float, default=0.0)
+    parser.add_argument("--disable_loss_mask", default=False, action="store_true")
     parser.add_argument("--max_length", type=int, default=2048)
     parser.add_argument("--max_seq_len", type=int, default=256)
-    parser.add_argument("--log_dir", default="logs", type=str)
+    parser.add_argument("--log_dir", default=None, type=str)
     parser.add_argument("--use_wandb", default=False, action="store_true")
     parser.add_argument("--grad_checkpoint", default=False, action="store_true")
     parser.add_argument("--use_flash_attn", default=False, action="store_true")
diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.py b/applications/ColossalChat/examples/training_scripts/train_sft.py
index 6007a8599277..c4ef3b783d4d 100755
--- a/applications/ColossalChat/examples/training_scripts/train_sft.py
+++ b/applications/ColossalChat/examples/training_scripts/train_sft.py
@@ -272,6 +272,7 @@ def train(args):
         lr_scheduler=lr_scheduler,
         max_epochs=args.max_epochs,
         accumulation_steps=args.accumulation_steps,
+        apply_loss_mask=not args.disable_loss_mask,
         start_epoch=start_epoch,
         save_interval=args.save_interval,
         save_dir=args.save_path,
@@ -317,6 +318,7 @@ def train(args):
     parser.add_argument("--tp", type=int, default=1)
     parser.add_argument("--pp", type=int, default=1)
     parser.add_argument("--sp", type=int, default=1)
+    parser.add_argument("--disable_loss_mask", default=False, action="store_true")
     parser.add_argument("--enable_sequence_parallelism", default=False, action="store_true")
     parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage", choices=[0, 1, 2])
     parser.add_argument("--zero_cpu_offload", default=False, action="store_true")
diff --git a/applications/ColossalChat/requirements.txt b/applications/ColossalChat/requirements.txt
index 2188de12f2be..ac40ae821d0a 100755
--- a/applications/ColossalChat/requirements.txt
+++ b/applications/ColossalChat/requirements.txt
@@ -2,7 +2,7 @@ transformers==4.39.3
 tqdm
 datasets==2.14.7
 loralib
-colossalai==0.4.0
+colossalai>=0.4.0
 torch>=2.1.0
 langchain
 tokenizers
diff --git a/applications/ColossalChat/tests/test_train.sh b/applications/ColossalChat/tests/test_train.sh
index c26b25c837e6..69036de635c9 100755
--- a/applications/ColossalChat/tests/test_train.sh
+++ b/applications/ColossalChat/tests/test_train.sh
@@ -15,7 +15,7 @@ set_n_least_used_CUDA_VISIBLE_DEVICES() {
     echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
 }
 
-set_n_least_used_CUDA_VISIBLE_DEVICES 4
+set_n_least_used_CUDA_VISIBLE_DEVICES 2
 
 set -xu
 
@@ -119,11 +119,11 @@ for lora_rank in ${LORA_RANK[@]}; do
                 lora_config=""
             fi
             if [[ $plugin == "3d" ]]; then
-                tp='4'
+                tp='2'
                 bs='8'
             fi
             if [[ $plugin == "tp_zero2" ]]; then
-                tp='4'
+                tp='2'
                 bs='8'
                 zero_stage='2'
                 plugin='3d'
@@ -136,13 +136,13 @@ for lora_rank in ${LORA_RANK[@]}; do
             fi
             if [[ $plugin == "pp" ]]; then
                 bs='8'
-                pp='4'
+                pp='2'
                 plugin='3d'
             fi
             if [[ $plugin == "sp_split_gather" ]]; then
                 enable_sequence_parallelism='--enable_sequence_parallelism'
                 sp_mode='split_gather'
-                tp='4'
+                tp='2'
                 sp='1'
                 bs='8'
                 plugin='3d'
@@ -150,7 +150,7 @@ for lora_rank in ${LORA_RANK[@]}; do
             if [[ $plugin == "sp_ring" ]]; then
                 enable_sequence_parallelism='--enable_sequence_parallelism'
                 sp_mode='ring'
-                tp='4'
+                tp='2'
                 sp='1'
                 bs='8'
                 plugin='3d'
@@ -159,7 +159,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 enable_sequence_parallelism='--enable_sequence_parallelism'
                 sp_mode='all_to_all'
                 tp='1'
-                sp='4'
+                sp='2'
                 bs='8'
                 plugin='3d'
             fi
@@ -175,7 +175,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 for split in $(seq -f "%05g" 0 0); do
                     dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split")
                 done
-                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
+                colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_sft.py \
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --dataset ${dataset[@]} \
@@ -242,7 +242,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 lora_config=""
             fi
             if [[ $plugin == "3d" ]]; then
-                tp='4'
+                tp='2'
                 bs='8'
             fi
             grad_accu='2'
@@ -256,7 +256,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 for split in $(seq -f "%05g" 0 0); do
                     dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_preference/arrow/part-$split")
                 done
-                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_rm.py \
+                colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_rm.py \
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --dataset ${dataset[@]} \
@@ -325,7 +325,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 lora_config=""
             fi
             if [[ $plugin == "3d" ]]; then
-                tp='4'
+                tp='2'
                 bs='16'
                 ebs='32'
             fi
@@ -350,7 +350,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 for split in $(seq -f "%05g" 0 0); do
                     ptx_dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_sft/arrow/part-$split")
                 done
-                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_ppo.py \
+                colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_ppo.py \
                     --pretrain $pretrain \
                     --rm_pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
@@ -417,7 +417,7 @@ for lora_rank in ${LORA_RANK[@]}; do
             tp='1'
             bs='2'
             if [[ $plugin == "3d" ]]; then
-                tp='4'
+                tp='2'
                 bs='8'
             fi
             if [[ $plugin == "zero2" ]]; then
@@ -442,7 +442,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 for split in $(seq -f "%05g" 0 0); do
                     dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_preference/arrow/part-$split")
                 done
-                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_dpo.py \
+                colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_dpo.py \
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --dataset ${dataset[@]} \
@@ -500,7 +500,7 @@ for lora_rank in ${LORA_RANK[@]}; do
             tp='1'
             bs='2'
             if [[ $plugin == "3d" ]]; then
-                tp='4'
+                tp='2'
                 bs='8'
             fi
             if [[ $plugin == "zero2" ]]; then
@@ -525,7 +525,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 for split in $(seq -f "%05g" 0 0); do
                     dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_preference/arrow/part-$split")
                 done
-                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_orpo.py \
+                colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_orpo.py \
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --dataset ${dataset[@]} \
@@ -583,7 +583,7 @@ for lora_rank in ${LORA_RANK[@]}; do
             tp='1'
             bs='2'
             if [[ $plugin == "3d" ]]; then
-                tp='4'
+                tp='2'
                 bs='8'
             fi
             if [[ $plugin == "zero2" ]]; then
@@ -608,7 +608,7 @@ for lora_rank in ${LORA_RANK[@]}; do
                 for split in $(seq -f "%05g" 0 0); do
                     dataset+=("$TEMP_DIR/rlhf_data/tokenized_${model}_kto/arrow/part-$split")
                 done
-                colossalai run --nproc_per_node 4 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_kto.py \
+                colossalai run --nproc_per_node 2 --master_port 31332 $EXAMPLES_DIR/training_scripts/train_kto.py \
                     --pretrain $pretrain \
                     --tokenizer_dir $tokenizer_dir \
                     --dataset ${dataset[@]} \
diff --git a/applications/README.md b/applications/README.md
index 5b8b5e5012aa..9957300ae7f0 100644
--- a/applications/README.md
+++ b/applications/README.md
@@ -14,9 +14,9 @@ This directory contains the applications that are powered by Colossal-AI.
 The list of applications include:
 
 - [X] [Open-Sora](https://github.com/hpcaitech/Open-Sora): Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models
+- [X] [ColossalChat](./ColossalChat/): Replication of ChatGPT with RLHF.
 - [X] [Colossal-LLaMA](./Colossal-LLaMA/): Continual Pre-training and Supervisied Fine-tuning of LLaMA2 / LLaMA3.
 - [X] [ColossalEval](./ColossalEval): Evaluation Pipeline for LLMs.
-- [X] [ColossalChat](./Chat/README.md): Replication of ChatGPT with RLHF.
 - [X] [FastFold](https://github.com/hpcaitech/FastFold): Optimizing AlphaFold (Biomedicine) Training and Inference on GPU Clusters.
 - [X] [ColossalQA](./ColossalQA/README.md): Document Retrieval Conversation System
 - [X] [SwiftInfer](https://github.com/hpcaitech/SwiftInfer): Breaks the Length Limit of LLM Inference for Multi-Round Conversations
diff --git a/colossalai/booster/booster.py b/colossalai/booster/booster.py
index 56d8a0935f10..8047d90f7a69 100644
--- a/colossalai/booster/booster.py
+++ b/colossalai/booster/booster.py
@@ -1,4 +1,3 @@
-import warnings
 from contextlib import contextmanager
 from typing import Any, Callable, Dict, Iterator, List, Optional, Union
 
@@ -8,6 +7,8 @@
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
 
+from colossalai.logging import get_dist_logger
+
 SUPPORT_PEFT = False
 try:
     import peft
@@ -81,12 +82,15 @@ def __init__(
                 plugin, Plugin
             ), f"Expected the argument plugin to be an instance of Plugin, but got {type(plugin)}."
         self.plugin = plugin
+        self.logger = get_dist_logger()
 
         # set accelerator
         if self.plugin and self.plugin.control_device():
             self.accelerator = None
             if device is not None:
-                warnings.warn("The plugin will control the accelerator, so the device argument will be ignored.")
+                self.logger.warning(
+                    "The plugin will control the accelerator," "so the device argument will be ignored.", ranks=[0]
+                )
         else:
             device = device or "cuda"
             self.accelerator = Accelerator(device)
@@ -94,7 +98,10 @@ def __init__(
         # set precision
         if self.plugin and self.plugin.control_precision():
             if mixed_precision is not None:
-                warnings.warn("The plugin will control the precision, so the mixed_precision argument will be ignored.")
+                self.logger.warning(
+                    "The plugin will control the precision," "so the mixed_precision argument will be ignored.",
+                    ranks=[0],
+                )
             self.mixed_precision = None
         elif mixed_precision is None:
             self.mixed_precision = None
@@ -267,8 +274,9 @@ def enable_lora(
             ), "Please provide pretrained directory path if not passing in lora configuration."
         if quantize is True:
             if bnb_quantization_config is not None:
-                warnings.warn(
-                    "User defined BnbQuantizationConfig is not fully tested in ColossalAI. Use it at your own risk."
+                self.logger.warning(
+                    "User defined BnbQuantizationConfig is not fully tested in ColossalAI. Use it at your own risk.",
+                    ranks=[0],
                 )
             else:
                 bnb_quantization_config = BnbQuantizationConfig(
diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index d0d3275cfab9..6a5d0c1613df 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -1,5 +1,4 @@
 import gc
-import logging
 import os
 import random
 from pathlib import Path
@@ -27,6 +26,7 @@
 )
 from colossalai.cluster import DistCoordinator, ProcessGroupMesh
 from colossalai.interface import ModelWrapper, OptimizerWrapper
+from colossalai.logging import get_dist_logger
 from colossalai.shardformer import ShardConfig, ShardFormer
 from colossalai.zero import GeminiDDP, GeminiOptimizer
 from colossalai.zero.gemini.memory_tracer import MemStats
@@ -63,6 +63,7 @@ class GeminiCheckpointIO(GeneralCheckpointIO):
     def __init__(self) -> None:
         super().__init__()
         self.coordinator = DistCoordinator()
+        self.logger = get_dist_logger()
 
     def save_unsharded_model(self, model: GeminiDDP, checkpoint: str, gather_dtensor: bool, use_safetensors: bool):
         """
@@ -118,7 +119,7 @@ def save_sharded_model(
         """
         assert isinstance(model, GeminiDDP), "Please boost the model before saving!"
         if os.path.isfile(checkpoint_path):
-            logging.error(f"Provided path ({checkpoint_path}) should be a directory, not a file")
+            self.logger.error(f"Provided path ({checkpoint_path}) should be a directory, not a file", ranks=[0])
             return
 
         Path(checkpoint_path).mkdir(parents=True, exist_ok=True)
@@ -143,10 +144,11 @@ def save_sharded_model(
             index_file.append_meta_data("total_size", total_size)
             index_file.write_index_file(save_index_file)
             save_config_file(model.unwrap(), checkpoint_path)
-            logging.info(
+            self.logger.info(
                 f"The model is split into checkpoint shards. "
                 f"You can find where each parameters has been saved in the "
-                f"index located at {save_index_file}."
+                f"index located at {save_index_file}.",
+                ranks=[0],
             )
 
     def load_sharded_model(
@@ -168,7 +170,7 @@ def save_sharded_optimizer(
         assert isinstance(optimizer, GeminiOptimizer), "Please boost the optimizer before saving!"
 
         if os.path.isfile(checkpoint):
-            logging.error(f"Provided path ({checkpoint}) should be a directory, not a file")
+            self.logger.error(f"Provided path ({checkpoint}) should be a directory, not a file", ranks=[0])
             return
 
         Path(checkpoint).mkdir(parents=True, exist_ok=True)
@@ -201,10 +203,11 @@ def save_sharded_optimizer(
         if self.coordinator.is_master():
             index_file.append_meta_data("total_size", total_size)
             index_file.write_index_file(save_index_file)
-            logging.info(
+            self.logger.info(
                 f"The optimizer is going to be split to checkpoint shards. "
                 f"You can find where each parameters has been saved in the "
-                f"index located at {save_index_file}."
+                f"index located at {save_index_file}.",
+                ranks=[0],
             )
 
     def load_sharded_optimizer(self, optimizer: GeminiOptimizer, checkpoint_index_file: Path, prefix: str):
@@ -214,7 +217,7 @@ def load_sharded_optimizer(self, optimizer: GeminiOptimizer, checkpoint_index_fi
         """
         assert isinstance(optimizer, GeminiOptimizer), "Please boost the optimizer before loading!"
         if not os.path.isfile(checkpoint_index_file):
-            logging.error(f"Provided path ({checkpoint_index_file}) should be a file")
+            self.logger.error(f"Provided path ({checkpoint_index_file}) should be a file", ranks=[0])
 
         assert isinstance(optimizer, GeminiOptimizer)
 
@@ -371,9 +374,12 @@ def __init__(
         assert precision in SUPPORTED_PRECISION, f"precision {precision} is not supported"
         if get_accelerator().name == "npu":
             assert placement_policy == "static", "NPU only supports static placement policy"
+
+        self.logger = get_dist_logger()
         if enable_async_reduce and not pin_memory:
-            logging.warning(
-                f"enable_async_reduce sets pin_memory=True to achieve best performance, which is not implicitly set."
+            self.logger.warning(
+                f"enable_async_reduce sets pin_memory=True to achieve best performance, which is not implicitly set.",
+                ranks=[0],
             )
             pin_memory = True
         self.gemini_config = dict(
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 9c0ee9e501ae..ae1fbc7719a0 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1,6 +1,5 @@
 import ctypes
 import random
-import warnings
 from collections import defaultdict
 from contextlib import contextmanager, nullcontext
 from copy import deepcopy
@@ -27,13 +26,14 @@
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.interface import AMPModelMixin, ModelWrapper, OptimizerWrapper
 from colossalai.interface.optimizer import DistributedOptim
+from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import DistGaloreAwamW, cast_to_distributed
 from colossalai.pipeline.schedule import InterleavedSchedule, OneForwardOneBackwardSchedule
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.quantization import BnbQuantizationConfig, quantize_model
 from colossalai.quantization.fp8_hook import FP8Hook
 from colossalai.shardformer import GradientCheckpointConfig, ShardConfig, ShardFormer
-from colossalai.shardformer.layer.utils import SeqParallelUtils
+from colossalai.shardformer.layer.utils import SeqParallelUtils, is_share_sp_tp
 from colossalai.shardformer.policies.base_policy import Policy
 from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.tensor.d_tensor.api import is_distributed_tensor
@@ -43,7 +43,7 @@
 
 from .pp_plugin_base import PipelinePluginBase
 
-SUPPORT_SP_MODE = ["split_gather", "ring", "all_to_all"]
+SUPPORT_SP_MODE = ["split_gather", "ring", "all_to_all", "ring_attn"]
 
 PRECISION_TORCH_TYPE = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}
 
@@ -74,7 +74,7 @@ def __init__(
         self.dp_group = dp_group
         self.tp_group = tp_group
         self.sp_group = sp_group
-        self.use_dpp = use_ddp
+        self.use_ddp = use_ddp
         self.require_grad_sync = True
         self.overlap_allgather = overlap_allgather
         self.use_fp8 = use_fp8
@@ -116,11 +116,10 @@ def __init__(
 
         super().__init__(module)
         self.op_hooks = []
-        if overlap_allgather:
-            self.op_hooks.append(ZeroOpHook())
         if use_fp8:
             self.op_hooks.append(FP8Hook())
-        if overlap_allgather or use_fp8:
+        if overlap_allgather:
+            self.op_hook = ZeroOpHook()
             for p in module.parameters():
                 if p.requires_grad and type(p) is not ColoParameter:
                     p.__class__ = ColoParameter
@@ -146,8 +145,8 @@ def no_sync(self):
         # Disable automatic gradient synchronization.
         self.require_grad_sync = False
         try:
-            if self.use_dpp:
-                # If using data parallel processing (use_dpp), disable synchronization too.
+            if self.use_ddp:
+                # If using data parallel processing (use_ddp), disable synchronization too.
                 with self.module.no_sync():
                     yield
             else:
@@ -195,7 +194,7 @@ def sync_sp_grads(self, grads: Optional[List[torch.Tensor]] = None):
         """
 
         if self.shard_config.enable_sequence_parallelism:
-            if self.shard_config.sequence_parallelism_mode == "all_to_all":
+            if self.shard_config.sequence_parallelism_mode in ["all_to_all", "ring_attn"]:
                 return
 
             if self.shard_config.sequence_parallelism_mode in ["split_gather", "ring"]:
@@ -980,8 +979,11 @@ class HybridParallelPlugin(PipelinePluginBase):
         gradient_checkpoint_config (GradientCheckpointConfig, optional): Configuration for gradient checkpointing. Defaults to None.
         enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
         make_vocab_size_divisible_by (int, optional): it's used when padding the vocabulary size, to make it choose an faster kenel. Default to 64.
-        overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism.
         fp8_communication (bool, optional): Whether to enable fp8 communication in model parallelism
+        overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism
+        inner_ring_size (int, optional): The inner ring size of 2D Ring Attention when sp mode is "ring_attn".
+            It's advisable to not tune this (especially in single-node settings) and let it be heuristically set based on topology by default.
+
     """
 
     def __init__(
@@ -1031,8 +1033,10 @@ def __init__(
         overlap_allgather: bool = False,
         fp8_communication: bool = False,
         use_fp8: bool = False,
+        inner_ring_size: int = None,
     ) -> None:
         super().__init__()
+        self.logger = get_dist_logger()
 
         assert (
             dist.get_world_size() % (tp_size * pp_size) == 0
@@ -1050,14 +1054,17 @@ def __init__(
                     tp_size > 1
                 ), f"Sequence parallelism mode {self.sequence_parallelism_mode} must be enabled when using tensor parallelism"
                 if sp_size != 1:
-                    warnings.warn(
-                        f"The sp_size will be the same as tp_size in sequence parallelism mode {self.sequence_parallelism_mode}, will ignore the given sequence parallelism size."
+                    self.logger.warning(
+                        f"The sp_size will be the same as tp_size in sequence parallelism mode {self.sequence_parallelism_mode}, will ignore the given sequence parallelism size.",
+                        ranks=[0],
                     )
                 self.sp_size = 1
                 self.dp_size = dist.get_world_size() // (tp_size * pp_size)
-            elif self.sequence_parallelism_mode in ["all_to_all"]:
+            elif self.sequence_parallelism_mode in ["all_to_all", "ring_attn"]:
                 self.sp_size = 1 if sp_size is None else sp_size
                 self.dp_size = dist.get_world_size() // (self.sp_size * pp_size * tp_size)
+                if self.sequence_parallelism_mode == "ring_attn":
+                    enable_flash_attention = True
         else:
             self.dp_size = dist.get_world_size() // (tp_size * pp_size)
             assert (
@@ -1079,9 +1086,21 @@ def __init__(
         if dp_outside:
             self.dp_axis, self.pp_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3
             self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size, self.sp_size)
+            if sequence_parallelism_mode == "ring_attn":
+                # Swap tp and sp since 2D Ring has better inter-node latency
+                self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.sp_size, self.tp_size)
+                self.sp_axis = 2
+                self.tp_axis = 3
+            else:
+                self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size, self.sp_size)
         else:
             self.pp_axis, self.dp_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3
-            self.pg_mesh = ProcessGroupMesh(self.pp_size, self.dp_size, self.tp_size, self.sp_size)
+            if sequence_parallelism_mode == "ring_attn":
+                self.pg_mesh = ProcessGroupMesh(self.pp_size, self.dp_size, self.sp_size, self.tp_size)
+                self.sp_axis = 2
+                self.tp_axis = 3
+            else:
+                self.pg_mesh = ProcessGroupMesh(self.pp_size, self.dp_size, self.tp_size, self.sp_size)
 
         self.stage_manager = None
         self.schedule = None
@@ -1125,6 +1144,13 @@ def __init__(
                 )
             else:
                 raise NotImplementedError()
+        if sequence_parallelism_mode == "ring_attn":
+            if not parallel_output:
+                self.logger.warning(
+                    "parallel_output must be True for Zigzag Ring Attention, as we've not supported Zigzag all-gather yet.",
+                    ranks=[0],
+                )
+                parallel_output = True
 
         self.tp_group = self.pg_mesh.get_group_along_axis(self.tp_axis)
         self.dp_group = self.pg_mesh.get_group_along_axis(self.dp_axis)
@@ -1150,6 +1176,7 @@ def __init__(
             make_vocab_size_divisible_by=make_vocab_size_divisible_by,
             gradient_checkpoint_config=gradient_checkpoint_config,
             fp8_communication=fp8_communication,
+            inner_ring_size=inner_ring_size,
         )
         self.amp_config = dict(
             initial_scale=initial_scale,
@@ -1229,20 +1256,23 @@ def configure(
         optimizer = cast_to_distributed(optimizer)
 
         if isinstance(optimizer, DistGaloreAwamW) and zero_stage > 0 and self.dp_size > 0:
-            warnings.warn("Galore is only supported for Tensor Parallel and vanilla Data Parallel yet. Disabling ZeRO.")
+            self.logger.warning(
+                "Galore is only supported for Tensor Parallel and vanilla Data Parallel yet. Disabling ZeRO.",
+                ranks=[0],
+            )
             zero_config["partition_grad"] = False
             zero_stage = 0
 
         if not isinstance(model, ModelWrapper):
+            # Shouldn't use pp (frequent grad accumulation) with torch ddp
             use_ddp = (self.dp_size > 1 and self.pp_size == 1 and self.zero_stage == 0) or (
-                self.dp_size == 1
-                and self.pp_size == 1
-                and self.enable_sequence_parallelism
-                and self.sequence_parallelism_mode == "all_to_all"
+                self.dp_size == 1 and self.pp_size == 1
             )
             # sync gradients across DP * SP ranks
-            if self.enable_sequence_parallelism and self.sequence_parallelism_mode == "all_to_all":
+            # Apply Hybrid ZeRO across DP * SP ranks
+            if self.enable_sequence_parallelism and not is_share_sp_tp(self.sequence_parallelism_mode):
                 dp_group = self.pg_mesh.create_group_along_axis([self.dp_axis, self.sp_axis])
+                self.dp_size = get_world_size(dp_group)
             else:
                 dp_group = self.dp_group
             model = HybridParallelModule(
@@ -1286,9 +1316,10 @@ def configure(
             else:
                 is_zero = self.dp_size > 1
                 if self.dp_size == 1:
-                    warnings.warn(
+                    self.logger.warning(
                         "Use Zero Optimizer when data parallel size is 1 may introduce unnecessary overhead. "
-                        "If you do not intend to use cpu_offload, please consider set zero_stage=0."
+                        "If you do not intend to use cpu_offload, please consider set zero_stage=0.",
+                        ranks=[0],
                     )
 
                 assert self.precision != "fp32", "Please set precision to 'fp16' or 'bf16' when using ZeRO."
@@ -1331,7 +1362,7 @@ def execute_pipeline(
         assert self.enable_pipeline_parallelism, "pipeline parallelism is not enabled"
 
         if return_outputs:
-            warnings.warn("return_outputs may lead to significant extra memory consumption.")
+            self.logger.warning("return_outputs may lead to significant extra memory consumption.", ranks=[0])
 
         # Create a context for gradient synchronization based on the optimizer type.
         # If it's a HybridParallelZeroOptimizer, use optimizer.no_sync(); otherwise, use model.no_sync().
@@ -1446,7 +1477,7 @@ def enable_lora(
         assert not isinstance(model, HybridParallelModule), "Lora should be enabled before boosting the model."
         assert self.pp_size == 1 and self.tp_size == 1
         self.lora_enabled = True
-        warnings.warn("You have enabled LoRa training. Please check the hyperparameters such as lr")
+        self.logger.warning("You have enabled LoRa training. Please check the hyperparameters such as lr", ranks=[0])
 
         if bnb_quantization_config is not None:
             model = quantize_model(model, bnb_quantization_config)
diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 63d46f6f8a2e..4188491c2e3c 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -1,7 +1,5 @@
 import enum
-import logging
 import os
-import warnings
 from contextlib import nullcontext
 from functools import partial
 from pathlib import Path
@@ -33,6 +31,7 @@
 )
 from colossalai.interface import AMPModelMixin, ModelWrapper, OptimizerWrapper
 from colossalai.interface.optimizer import DistributedOptim
+from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import DistGaloreAwamW, cast_to_distributed
 from colossalai.quantization import BnbQuantizationConfig, quantize_model
 from colossalai.quantization.fp8_hook import FP8Hook
@@ -146,7 +145,7 @@ def save_sharded_optimizer(
         """
         assert isinstance(optimizer, LowLevelZeroOptimizer), "Please boost the optimizer before saving!"
         if os.path.isfile(checkpoint):
-            logging.error(f"Provided path ({checkpoint}) should be a directory, not a file")
+            self.logger.error(f"Provided path ({checkpoint}) should be a directory, not a file", ranks=[0])
             return
 
         Path(checkpoint).mkdir(parents=True, exist_ok=True)
@@ -183,10 +182,11 @@ def save_sharded_optimizer(
         index_file.append_meta_data("total_size", total_size)
         if self.coordinator.is_master():
             index_file.write_index_file(save_index_file)
-        logging.info(
+        self.logger.info(
             f"The optimizer is going to be split to checkpoint shards. "
             f"You can find where each parameters has been saved in the "
-            f"index located at {save_index_file}."
+            f"index located at {save_index_file}.",
+            ranks=[0],
         )
 
     def load_sharded_optimizer(self, optimizer: OptimizerWrapper, index_file_path: str, prefix: str):
@@ -273,7 +273,7 @@ def save_sharded_model(
 
     def save_lora_as_pretrained(self, model, checkpoint, use_safetensors):
         if os.path.isfile(checkpoint):
-            logging.error(f"Provided path ({checkpoint}) should be a directory, not a file")
+            self.logger.error(f"Provided path ({checkpoint}) should be a directory, not a file", ranks=[0])
             return
         from peft import PeftModel
 
@@ -371,8 +371,8 @@ def __init__(
         )
         self.lora_enabled = False
         self.verbose = verbose
+        self.logger = get_dist_logger()
         self.use_fp8 = use_fp8
-
         # set class name with stage, for better error message
         setattr(self.__class__, "__name__", f"LowLevelZeroPlugin_ZeRO-{stage}")
 
@@ -408,7 +408,7 @@ def enable_lora(
 
         assert not isinstance(model, LowLevelZeroModel), "Lora should be enabled before boosting the model."
         self.lora_enabled = True
-        warnings.warn("You have enabled LoRa training. Please check the hyperparameters such as lr")
+        self.logger.warning("You have enabled LoRa training. Please check the hyperparameters such as lr", ranks=[0])
 
         if bnb_quantization_config is not None:
             model = quantize_model(model, bnb_quantization_config)
@@ -457,8 +457,9 @@ def add_lora_params_to_optimizer(self, model, optimizer):
                 origin_param = name2param[origin_key]
                 group_id, check_state = self.get_param_group_id(optimizer, origin_param, param)
                 if check_state == OptimizerParamCheckState.ORIGIN_PARAM_NOT_FIND:
-                    warnings.warn(
-                        f"Origin parameter {origin_key} related to {name} doesn't exist in optimizer param_groups."
+                    self.logger.warning(
+                        f"Origin parameter {origin_key} related to {name} doesn't exist in optimizer param_groups.",
+                        ranks=[0],
                     )
                 elif (
                     check_state == OptimizerParamCheckState.ORIGIN_PARAM_FINDED
@@ -501,7 +502,10 @@ def configure(
         optimizer = cast_to_distributed(optimizer)
 
         if isinstance(optimizer, DistGaloreAwamW) and zero_stage > 0 and dp_size > 0:
-            warnings.warn("Galore is only supported for Tensor Parallel and vanilla Data Parallel yet. Disabling ZeRO.")
+            self.logger.warning(
+                "Galore is only supported for Tensor Parallel and vanilla Data Parallel yet. Disabling ZeRO.",
+                ranks=[0],
+            )
             zero_optim_kwargs["partition_grad"] = False
             zero_stage = 0
 
diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index 374fc653556e..74d35f5c5505 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -1,4 +1,3 @@
-import warnings
 from collections import defaultdict
 from types import MethodType
 from typing import Callable, List, Optional, OrderedDict, Tuple
@@ -26,6 +25,7 @@
 from colossalai.cluster.process_group_mesh import ProcessGroupMesh
 from colossalai.interface import ModelWrapper, OptimizerWrapper
 from colossalai.interface.optimizer import DistributedOptim
+from colossalai.logging import get_dist_logger
 from colossalai.nn.optimizer import cast_to_distributed
 from colossalai.pipeline.schedule.interleaved_pp import InterleavedSchedule
 from colossalai.pipeline.schedule.one_f_one_b import OneForwardOneBackwardSchedule
@@ -217,12 +217,14 @@ def __init__(
         fp8_communication: bool = False,
         use_fp8: bool = False,
     ) -> None:
+        self.logger = get_dist_logger()
         if overlap_communication or zero_stage == 2:
             overlap_communication = False
             zero_stage = 1
-            warnings.warn(
+            self.logger.warning(
                 f"overlap_communication and zero_stage are set to False and 1 because "
-                f"ZeRO-2 or comm overlap cause program hang when some experts are not routed. "
+                f"ZeRO-2 or comm overlap cause program hang when some experts are not routed.",
+                ranks=[0],
             )
 
         assert (
@@ -240,8 +242,10 @@ def __init__(
                     tp_size > 1
                 ), f"Sequence parallelism mode {self.sequence_parallelism_mode} must be enabled when using tensor parallelism"
                 if sp_size != 1:
-                    warnings.warn(
-                        f"The sp_size will be the same as tp_size in sequence parallelism mode {self.sequence_parallelism_mode}, will ignore the given sequence parallelism size."
+                    self.logger.warning(
+                        f"The sp_size will be the same as tp_size in sequence parallelism mode {self.sequence_parallelism_mode},"
+                        "will ignore the given sequence parallelism size.",
+                        ranks=[0],
                     )
                 self.sp_size = 1
                 self.dp_size = dist.get_world_size() // (tp_size * pp_size)
@@ -326,6 +330,7 @@ def __init__(
         else:
             self.sp_group = self.pg_mesh.get_group_along_axis(self.sp_axis)
         self.use_fp8 = use_fp8
+
         self.shard_config = ShardConfig(
             tensor_parallel_process_group=self.tp_group,
             sequence_parallel_process_group=self.sp_group,
@@ -403,8 +408,9 @@ def configure(
                 and self.sequence_parallelism_mode == "all_to_all"
             )
             if use_ddp:
-                warnings.warn(
-                    f"Will have to check all params are used in pytorch DDP since not all experts are always activated"
+                self.logger.warning(
+                    f"Will have to check all params are used in pytorch DDP since not all experts are always activated",
+                    ranks=[0],
                 )
                 self.ddp_config["find_unused_parameters"] = True
 
@@ -461,9 +467,10 @@ def configure(
                     )
             else:
                 if self.dp_size <= 1:
-                    warnings.warn(
+                    self.logger.warning(
                         "Use Zero Optimizer when data parallel size is 1 may introduce unnecessary overhead. "
-                        "If you do not intend to use cpu_offload, please consider set zero_stage=0."
+                        "If you do not intend to use cpu_offload, please consider set zero_stage=0.",
+                        ranks=[0],
                     )
                 assert self.precision != "fp32", "Please set precision to 'fp16' or 'bf16' when using ZeRO."
                 optimizer = MoeHybridParallelZeroOptimizer(
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
index 34caa2f684ba..61d785a4c9e7 100644
--- a/colossalai/booster/plugin/torch_ddp_plugin.py
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -9,6 +9,7 @@
 from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO
 from colossalai.cluster import DistCoordinator
 from colossalai.interface import ModelWrapper, OptimizerWrapper
+from colossalai.logging import get_dist_logger
 from colossalai.quantization import BnbQuantizationConfig, quantize_model
 from colossalai.utils import get_current_device
 
@@ -21,6 +22,7 @@ class TorchDDPCheckpointIO(GeneralCheckpointIO):
     def __init__(self) -> None:
         super().__init__()
         self.coordinator = DistCoordinator()
+        self.logger = get_dist_logger()
 
     def load_unsharded_model(self, model: ModelWrapper, checkpoint: str, strict: bool = True):
         """
diff --git a/colossalai/booster/plugin/torch_fsdp_plugin.py b/colossalai/booster/plugin/torch_fsdp_plugin.py
index e3f81928de5e..23a35bbcbd3b 100644
--- a/colossalai/booster/plugin/torch_fsdp_plugin.py
+++ b/colossalai/booster/plugin/torch_fsdp_plugin.py
@@ -1,6 +1,4 @@
-import logging
 import os
-import warnings
 from pathlib import Path
 from typing import Callable, Dict, Iterable, Iterator, List, Optional, Tuple
 
@@ -30,6 +28,7 @@
 from colossalai.checkpoint_io import CheckpointIndexFile, CheckpointIO, GeneralCheckpointIO, utils
 from colossalai.cluster import DistCoordinator
 from colossalai.interface import ModelWrapper, OptimizerWrapper
+from colossalai.logging import get_dist_logger
 
 from .dp_plugin_base import DPPluginBase
 
@@ -40,6 +39,7 @@ class TorchFSDPCheckpointIO(GeneralCheckpointIO):
     def __init__(self) -> None:
         super().__init__()
         self.coordinator = DistCoordinator()
+        self.logger = get_dist_logger()
 
     def load_unsharded_model(self, model: ModelWrapper, checkpoint: str, strict: bool):
         assert isinstance(model, TorchFSDPModel), "Please boost the model before loading!"
@@ -88,7 +88,7 @@ def save_sharded_model(
         """
         assert isinstance(model, TorchFSDPModel), "Please boost the model before saving!"
         if os.path.isfile(checkpoint_path):
-            logging.error(f"Provided path ({checkpoint_path}) should be a directory, not a file")
+            self.logger.error(f"Provided path ({checkpoint_path}) should be a directory, not a file")
             return
 
         Path(checkpoint_path).mkdir(parents=True, exist_ok=True)
@@ -117,7 +117,7 @@ def save_sharded_model(
             index_file.append_meta_data("total_size", total_size)
             index_file.write_index_file(save_index_file)
             utils.save_config_file(model.unwrap(), checkpoint_path)
-            logging.info(
+            self.logger.info(
                 f"The model is split into checkpoint shards. "
                 f"You can find where each parameters has been saved in the "
                 f"index located at {save_index_file}."
@@ -162,7 +162,7 @@ def save_sharded_optimizer(
         assert isinstance(optimizer, FSDPOptimizerWrapper), "Please boost the optimizer before saving!"
 
         if os.path.isfile(checkpoint):
-            logging.error(f"Provided path ({checkpoint}) should be a directory, not a file")
+            self.logger.error(f"Provided path ({checkpoint}) should be a directory, not a file")
             return
 
         Path(checkpoint).mkdir(parents=True, exist_ok=True)
@@ -200,7 +200,7 @@ def save_sharded_optimizer(
 
             index_file.append_meta_data("total_size", total_size)
             index_file.write_index_file(save_index_file)
-            logging.info(
+            self.logger.info(
                 f"The optimizer is going to be split to checkpoint shards. "
                 f"You can find where each parameters has been saved in the "
                 f"index located at {save_index_file}."
@@ -313,6 +313,7 @@ def __init__(
                 sync_module_states=sync_module_states,
             )
             self.fp8_communication = fp8_communication
+            self.logger = get_dist_logger()
 
     else:
         raise RuntimeError("FSDP is not supported while torch version under 1.12.0.")
@@ -364,7 +365,7 @@ def configure(
 
         if optimizer is not None:
             if len(optimizer.param_groups) > 1:
-                warnings.warn(
+                self.logger.warning(
                     "TorchFSDPPlugin does not support optimizer that use multi param groups. The results may not be as expected if used."
                 )
             optimizer.__init__(fsdp_model.parameters(), **optimizer.defaults)
diff --git a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
index 0310df5489b0..043e5c2b0618 100644
--- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
+++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
@@ -203,7 +203,6 @@ def save_sharded_model(
             return
 
         Path(checkpoint).mkdir(parents=True, exist_ok=True)
-
         # Devices along the same dp_group share the same copies of model.
         # So only let the device with dp_rank == 0 save the model.
         if self.dp_rank != 0:
@@ -643,14 +642,12 @@ def save_unsharded_model(self, model: ModelWrapper, checkpoint: str, gather_dten
         assert isinstance(model, ModelWrapper), "Please boost the model before saving!"
         model._force_wait_all_gather()
         model = model.unwrap()
-
         if self.dp_rank != 0:
             return
 
         # The logic of collecting parameter shards along tp degree
         # has been implemented by _save_to_state_dict method of ParallelModule in Shardformer.
         state_dict = model.state_dict()
-
         if self.pp_size == 1:
             # When pipeline is not used, let master rank directly save the collected state_dict.
             if self.tp_rank == 0:
@@ -660,7 +657,6 @@ def save_unsharded_model(self, model: ModelWrapper, checkpoint: str, gather_dten
             state_dict_list = [None for _ in range(self.pp_size)]
             dist.barrier(self.pp_group)
             dist.all_gather_object(state_dict_list, state_dict, self.pp_group)
-
             # Only the master rank do the saving.
             if self.coordinator.is_master():
                 complete_state_dict = dict()
diff --git a/colossalai/lazy/pretrained.py b/colossalai/lazy/pretrained.py
index 736ffc5e4ea2..226951598aa2 100644
--- a/colossalai/lazy/pretrained.py
+++ b/colossalai/lazy/pretrained.py
@@ -62,7 +62,6 @@ def new_from_pretrained(
     config = kwargs.pop("config", None)
     cache_dir = kwargs.pop("cache_dir", None)
     force_download = kwargs.pop("force_download", False)
-    resume_download = kwargs.pop("resume_download", False)
     proxies = kwargs.pop("proxies", None)
     local_files_only = kwargs.pop("local_files_only", False)
     use_auth_token = kwargs.pop("use_auth_token", None)
@@ -116,7 +115,6 @@ def new_from_pretrained(
             cache_dir=cache_dir,
             return_unused_kwargs=True,
             force_download=force_download,
-            resume_download=resume_download,
             proxies=proxies,
             local_files_only=local_files_only,
             use_auth_token=use_auth_token,
@@ -195,7 +193,6 @@ def new_from_pretrained(
                     "cache_dir": cache_dir,
                     "force_download": force_download,
                     "proxies": proxies,
-                    "resume_download": resume_download,
                     "local_files_only": local_files_only,
                     "use_auth_token": use_auth_token,
                     "user_agent": user_agent,
@@ -312,7 +309,6 @@ def new_from_pretrained(
                 pretrained_model_name_or_path,
                 cache_dir=cache_dir,
                 force_download=force_download,
-                resume_download=resume_download,
                 proxies=proxies,
                 local_files_only=local_files_only,
                 use_auth_token=use_auth_token,
diff --git a/colossalai/legacy/moe/openmoe/model/openmoe_policy.py b/colossalai/legacy/moe/openmoe/model/openmoe_policy.py
index ccd566b08594..d5824afcba91 100644
--- a/colossalai/legacy/moe/openmoe/model/openmoe_policy.py
+++ b/colossalai/legacy/moe/openmoe/model/openmoe_policy.py
@@ -171,7 +171,7 @@ def module_policy(self):
         policy = super().module_policy()
 
         if self.shard_config.enable_tensor_parallelism:
-            # add a new item for casual lm
+            # add a new item for causal lm
             # TODO: recursively assign ep group foe all modules
             new_item = {
                 OpenMoeForCausalLM: ModulePolicyDescription(
diff --git a/colossalai/legacy/nn/layer/parallel_1d/_operation.py b/colossalai/legacy/nn/layer/parallel_1d/_operation.py
index 8b8f04ccf456..e892336bcf87 100644
--- a/colossalai/legacy/nn/layer/parallel_1d/_operation.py
+++ b/colossalai/legacy/nn/layer/parallel_1d/_operation.py
@@ -81,6 +81,9 @@ def backward(ctx, grad_output):
             handle = dist.all_reduce(grad_input, group=gpc.get_group(ctx.parallel_mode), async_op=True)
             # Delay the start of weight gradient computation shortly (3us) to have
             # all-reduce scheduled first and have GPU resources allocated
+            # TODO: This seems to only work if you add torch.cuda.Event.wait()
+
+            # _ = torch.zeros(1, device=grad_output.device)
 
         grad_weight = grad_output.t().matmul(total_input)
         grad_bias = grad_output.sum(dim=0) if use_bias else None
diff --git a/colossalai/logging/logger.py b/colossalai/logging/logger.py
index eb5f28e2a3cf..9f4b7a7b0f3c 100644
--- a/colossalai/logging/logger.py
+++ b/colossalai/logging/logger.py
@@ -64,7 +64,10 @@ def __init__(self, name):
             self._logger.propagate = False
 
             DistributedLogger.__instances[name] = self
-        self.rank = dist.get_rank() if dist.is_initialized() else 0
+
+    @property
+    def rank(self):
+        return dist.get_rank() if dist.is_initialized() else 0
 
     @staticmethod
     def __get_call_info():
diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index a7571c73139b..28c4eb8d8d4a 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -306,7 +306,6 @@ def forward_step(
         # for the first stage, input_obj is None
         # for other stages, input_obj is the output of the previous stage containing hidden_states etc.
         # Only attention_mask from micro_batch is used
-
         with self.stage_manager.switch_model_chunk_id(model_chunk_id):
             if isinstance(model_chunk, ModuleList):
                 output_obj = model_forward(model_chunk[model_chunk_id], micro_batch, input_obj)
diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index 3269d67ba7d4..67aaa5eb1b59 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -271,6 +271,7 @@ def forward_step(
         output_obj = model_forward(model, micro_batch, input_obj)
         if self.stage_manager.is_last_stage():
             loss = criterion(output_obj, micro_batch) / self.num_microbatches
+
             if accum_loss is not None:
                 accum_loss.add_(loss.detach())
             if outputs is not None:
diff --git a/colossalai/shardformer/layer/__init__.py b/colossalai/shardformer/layer/__init__.py
index 331e4972966c..8882a33c15e6 100644
--- a/colossalai/shardformer/layer/__init__.py
+++ b/colossalai/shardformer/layer/__init__.py
@@ -1,5 +1,5 @@
 from ._operation import all_to_all_comm
-from .attn import AttnMaskType, ColoAttention
+from .attn import AttnMaskType, ColoAttention, RingAttention, get_pad_info
 from .dropout import DropoutForParallelInput, DropoutForReplicatedInput
 from .embedding import Embedding1D, PaddingEmbedding, VocabParallelEmbedding1D
 from .linear import Linear1D_Col, Linear1D_Row, PaddingLMHead, VocabParallelLMHead1D
@@ -31,5 +31,7 @@
     "VocabParallelLMHead1D",
     "AttnMaskType",
     "ColoAttention",
+    "RingAttention",
+    "get_pad_info",
     "all_to_all_comm",
 ]
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index feebd2d0529d..f58a21df3614 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -2,6 +2,8 @@
 import torch.distributed as dist
 import torch.nn.functional as F
 
+from .utils import is_share_sp_tp
+
 try:
     import fused_mix_prec_layer_norm_cuda
 except:
@@ -105,7 +107,7 @@ def backward(ctx, grad_output):
         elif ctx.async_grad_allreduce:
             # Asynchronous all-reduce
             handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
-            # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
+            # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
             # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py
 
         grad_weight = total_input.t().matmul(grad_output)
@@ -353,7 +355,7 @@ def backward(ctx, grad_output):
                     input_.shape, dtype=input_parallel.dtype, device=input_parallel.device
                 ).contiguous()
                 handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True)
-                # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
+                # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
                 # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py
 
             if _grad_accum_fusion_available and weight.grad is not None:
@@ -677,8 +679,8 @@ def backward(ctx, grad_output):
                     input_.shape, dtype=input_parallel.dtype, device=input_parallel.device
                 ).contiguous()
                 handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True)
-                # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
-                # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py
+                # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
+                # all-reduce scheduled first and have GPU resources allocated
 
             grad_weight = total_input.t().matmul(grad_output)
             grad_bias = grad_output.sum(dim=0) if use_bias else None
@@ -760,16 +762,20 @@ class _ReduceForward(torch.autograd.Function):
 
     Args:
         input_: input matrix.
-        parallel_mode: parallel mode.
+        process_group: communication group.
+
     """
 
     @staticmethod
-    def forward(ctx, input_, process_group, fp8_communication=False):
+    def forward(ctx, input_, process_group, grad_scale=None, fp8_communication=False):
+        ctx.grad_scale = grad_scale
         return _reduce(input_, process_group, fp8_communication, fp8_format="e4m3")
 
     @staticmethod
     def backward(ctx, grad_output):
-        return grad_output, None, None
+        if ctx.grad_scale is not None:
+            grad_output = grad_output * ctx.grad_scale
+        return grad_output, None, None, None
 
 
 class _ReduceBackward(torch.autograd.Function):
@@ -1079,8 +1085,8 @@ def split_forward_gather_backward(input_, dim, process_group, grad_scale=None, f
     return _SplitForwardGatherBackward.apply(input_, dim, process_group, grad_scale, fp8_communication)
 
 
-def reduce_forward(input_, process_group, fp8_communication=False):
-    return _ReduceForward.apply(input_, process_group, fp8_communication)
+def reduce_forward(input_, process_group, grad_scale=None, fp8_communication=False):
+    return _ReduceForward.apply(input_, process_group, grad_scale, fp8_communication)
 
 
 def reduce_backward(input_, process_group, fp8_communication=False):
@@ -1089,3 +1095,15 @@ def reduce_backward(input_, process_group, fp8_communication=False):
 
 def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1, fp8_communication=False):
     return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim, fp8_communication)
+
+
+def gather_sp_output(hidden_states, sp_group, sp_mode, fp8_communication=False):
+    """
+    Gather the output of the last layer for cross entropy computation
+    """
+    # Rescale grad (HybridParallelPlugin applies ZeRO grad averaging on the DP * SP group)
+    scale = None if is_share_sp_tp(sp_mode) else dist.get_world_size(sp_group)
+    hidden_states = gather_forward_split_backward(
+        hidden_states, 1, sp_group, grad_scale=scale, fp8_communication=fp8_communication
+    )
+    return hidden_states
diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 5872c64856b9..b102387a0840 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -2,7 +2,10 @@
 from typing import Callable, Dict, Optional, Tuple
 
 import torch
+import torch.distributed
+import torch.distributed as dist
 import torch.nn.functional as F
+from einops import rearrange
 
 from colossalai.kernel.kernel_loader import (
     FlashAttentionForFloatAndCustomMaskLoader,
@@ -10,12 +13,18 @@
     FlashAttentionWithCustomMaskLoader,
     KernelLoader,
 )
+from colossalai.logging import get_dist_logger
+
+from .utils import RingComm, get_half_index, split_varlen_zigzag
 
 __all__ = [
     "AttnMaskType",
     "ColoAttention",
 ]
 
+_flash_attn_forward = _flash_attn_backward = None
+_unpad_input = _pad_input = None
+
 
 class AttnMaskType(Enum):
     CUSTOM = 0
@@ -38,20 +47,32 @@ def invert_mask(mask: torch.Tensor) -> torch.Tensor:
 
 
 # adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/bert_padding.py
-def get_pad_info(padding_mask: torch.Tensor) -> Tuple[int, torch.Tensor, torch.Tensor]:
+def get_pad_info(
+    padding_mask: torch.Tensor, invert: Optional[bool] = False, return_indices: Optional[bool] = True
+) -> Tuple[int, torch.Tensor, torch.Tensor]:
     """Get padding information from padding mask.
 
     Args:
-        padding_mask (torch.Tensor): Padding mask tensor. Shape should be [B, S]
+        padding_mask (torch.Tensor): Padding mask tensor. Shape should be [B, Skv]
+        invert (Optional[bool], optional): Whether to reverse the padding mask.
+        return_indices (Optional[bool], optional): Whether to return the indices of non-masked tokens.
 
     Returns:
-        Tuple[int, torch.Tensor, torch.Tensor]: Tuple of (max_seq_len, cu_seqlens, indices)
+        max_seqlen_in_batch (int): Maximum sequence length in the batch.
+        cu_seqlens (torch.Tensor): Shape [B+1]. Cumulative sequence lengths of the sequences in the batch.
+        indices (torch.Tensor): Shape [total_nonzero]. The indices of non-masked tokens from the flattened input sequence.
     """
+    if invert:
+        padding_mask = padding_mask.logical_not()
     seqlens_in_batch = padding_mask.sum(dim=-1, dtype=torch.int32)
-    indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
+    if return_indices:
+        indices = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
+
     max_seqlen_in_batch = seqlens_in_batch.max().item()
     cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
-    return max_seqlen_in_batch, cu_seqlens, indices
+    if return_indices:
+        return max_seqlen_in_batch, cu_seqlens, indices
+    return max_seqlen_in_batch, cu_seqlens
 
 
 class ColoAttention:
@@ -107,6 +128,7 @@ def prepare_attn_kwargs(
         q_padding_mask: Optional[torch.Tensor] = None,
         kv_padding_mask: Optional[torch.Tensor] = None,
         is_causal: bool = False,
+        invert: bool = True,
     ) -> Dict[str, torch.Tensor]:
         """Return a dictionary of keyword arguments for attention function. It supports 4 mask type.
         1. custom mask: no padding mask and is_causal=False, return {}, users should handle attention mask by themselves.
@@ -124,7 +146,7 @@ def prepare_attn_kwargs(
                 The shape should be [B, Skv]. ``1`` means valid token, and ``0`` means padding token.
                 If it's None and ``q_padding_mask`` is not None, it will be set to ``q_padding_mask``. Defaults to None.
             is_causal (bool, optional): Whether to use causal attention mask. Defaults to False.
-
+            invert_mask (bool, optional): Whether to invert the mask. Defaults to True.
         Returns:
             Dict[str, torch.Tensor]: Dictionary of keyword arguments for attention function.
         """
@@ -143,6 +165,10 @@ def prepare_attn_kwargs(
             if s_q != 1:
                 attention_mask = attention_mask.tril(diagonal=0)
             attention_mask = attention_mask.expand(b, s_q, s_kv)
+            attention_mask = torch.ones(s_q, s_kv, dtype=dtype, device=device)
+            if s_q != 1:
+                attention_mask = attention_mask.tril(diagonal=0)
+            attention_mask = attention_mask.expand(b, s_q, s_kv)
         else:
             max_seqlen_q, cu_seqlens_q, q_indices = get_pad_info(q_padding_mask)
             if kv_padding_mask is None:
@@ -154,7 +180,7 @@ def prepare_attn_kwargs(
             assert kv_padding_mask.shape == (
                 b,
                 s_kv,
-            ), f"q_padding_mask shape {kv_padding_mask.shape} should be the same. ({shape_4d})"
+            ), f"Padding mask shape {kv_padding_mask.shape} should align with shape 4d ({b}, {s_kv})"
             attention_mask = kv_padding_mask[:, None, :].expand(b, s_q, s_kv).to(dtype=dtype, device=device)
             outputs.update(
                 {
@@ -172,7 +198,8 @@ def prepare_attn_kwargs(
                     attention_mask = attention_mask * attention_mask.new_ones(s_q, s_kv).tril(diagonal=0)
             else:
                 outputs["attention_mask_type"] = AttnMaskType.PADDED
-        attention_mask = invert_mask(attention_mask).unsqueeze(1)
+        if invert:
+            attention_mask = invert_mask(attention_mask).unsqueeze(1)
         outputs["attention_mask"] = attention_mask
         return outputs
 
@@ -191,6 +218,7 @@ def attention(
         kv_indices: Optional[torch.Tensor] = None,
         dropout_p: float = 0.0,
         scale: Optional[float] = None,
+        **kwargs,
     ) -> torch.Tensor:
         """Flash Attention function. It supports 4 mask type.
         1. custom mask: recv attention_mask
@@ -199,9 +227,9 @@ def attention(
         4. padded causal mask: recv attention_mask, attention_mask_type, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, indices
 
         Args:
-            q (torch.Tensor): Query tensor. Shape should be [B, N, Sq, D]
-            k (torch.Tensor): Key tensor. Shape should be [B, N, Skv, D]
-            v (torch.Tensor): Value tensor. Shape should be [B, N, Skv, D]
+            q (torch.Tensor): Query tensor. Shape should be [B, nHeads, Sq, D]
+            k (torch.Tensor): Key tensor. Shape should be [B, nHeads, Skv, D]
+            v (torch.Tensor): Value tensor. Shape should be [B, nHeads, Skv, D]
             attention_mask (Optional[torch.Tensor], optional): Attention mask tensor. Shape should be [B, 1, Sq, Skv]. Defaults to None.
             attention_mask_type (AttnMaskType, optional): Attention mask type. Defaults to AttnMaskType.CUSTOM.
             cu_seqlens_q (Optional[torch.Tensor], optional): The cumulative sequence lengths
@@ -218,7 +246,7 @@ def attention(
             scale (Optional[float], optional): Scaling factor applied prior to softmax. Defaults to None.
 
         Returns:
-            torch.Tensor: Output tensor. Shape should be [B, N, Sq, D]
+            torch.Tensor: Output tensor. Shape should be [B, nHeads, Sq, D]
         """
         # known issue: sdpa does not support attention mask which contains whole row of masked tokens, which leads to nan
         # this case is usaul when padding mask is used and self attention is performed
@@ -252,6 +280,7 @@ def attention(
         else:
             # if attention_mask is None, attention_mask_type should be the default value
             assert attention_mask_type == AttnMaskType.CUSTOM
+
         # kernel dispatch
         mask_type = attention_mask_type if attention_mask is not None else None
         attn_func = ColoAttention._dispatch_kernel(q.dtype, mask_type)
@@ -274,3 +303,866 @@ def attention(
             q_indices=q_indices,
             kv_indices=kv_indices,
         )
+
+
+def _load_varlen_helpers():
+    """Helper to load functions for padding and unpadding packed sequences.
+    Use only when flash attn is installed
+    """
+    global _pad_input, _unpad_input
+    # Flash attn claims this is more efficient than torch's bool indexing due to avoiding
+    # broadcast
+    if _pad_input is None or _unpad_input is None:
+        try:
+            from flash_attn.bert_padding import index_first_axis, pad_input
+
+            def unpad_input(hidden_states: torch.Tensor, indices: torch.Tensor):
+                return index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices)
+
+            _pad_input = pad_input
+            _unpad_input = unpad_input
+        except ImportError as e:
+            raise RuntimeError(
+                f"Flash Attention is not installed. You can install it via 'pip install flash-attn --no-build-isolation'"
+            ) from e
+
+
+def _load_flash_attn():
+    """A light-weight loader to check whether flash-attn is installed.
+    Can't use ColoAttention._dispatch_kernel because we mutate the backward pass
+    """
+    global _flash_attn_forward, _flash_attn_backward
+    if _flash_attn_forward is None or _flash_attn_backward is None:
+        try:
+            from flash_attn.flash_attn_interface import _flash_attn_varlen_backward as _flash_attn_backward
+            from flash_attn.flash_attn_interface import _flash_attn_varlen_forward as _flash_attn_forward
+        except ImportError as e:
+            raise RuntimeError(
+                f"Flash Attention is not installed. You can install it via 'pip install flash-attn --no-build-isolation'"
+            ) from e
+
+    _load_varlen_helpers()
+
+
+# NOTE: This can cause spawned processes to hang on exit
+# with python 3.9
+@torch.compile()
+def _rescale_out_lse(out, block_out, lse, block_lse):
+    """
+    Compute the new attention denominator:
+        exp(lse) + exp(block_lse) = exp(max_scale) * (exp(min_scale - max_scale) + 1)
+    Args:
+        out: (T, H, D)
+        block_out: (T, H, D)
+        lse: (H, T, 1)
+        block_lse: (H, T, 1)
+    """
+
+    # min_scale = torch.min(lse, block_lse)
+    # max_scale = torch.max(lse, block_lse)
+    # new_lse = max_scale + torch.log(1 + torch.exp(min_scale - max_scale))
+
+    # NOTE: directly assigning to .data here is buggy
+    # probably due to casting dtypes/strides
+    new_lse = lse + torch.log(1 + torch.exp(block_lse - lse))
+
+    new_block_lse = torch.exp(block_lse - new_lse)
+    out = (torch.exp(lse - new_lse) * out + new_block_lse * block_out).to(out)
+    lse = new_lse
+
+    # Equivalent to the above
+    # See https://github.com/zhuzilin/ring-flash-attention/pull/34#issuecomment-2076126795
+    # out = (out - F.sigmoid(block_lse - lse) * (out - block_out))
+    # lse = (lse - F.logsigmoid(lse - block_lse))
+    return out, lse
+
+
+class RingAttention(torch.autograd.Function):
+    """Implements the Ring Attention from `Ring Attention with Blockwise Transformers for Near-Infinite Context`
+    (https://arxiv.org/abs/2310.01889).
+    For load-balancing we adopted the "zigzag" attention scheme from https://github.com/zhuzilin/ring-flash-attention/tree/main
+    For portable integration with more models, we don't follow the spirit of "block-wise FNN" in the original paper,
+    which requires fusing FFN with the Flash Attention kernel/function (see https://arxiv.org/pdf/2305.19370;
+    implemented in Jax and not optimized).
+    We adopt the double ring topology from LoongTrain (https://arxiv.org/pdf/2406.18485) to fully utilize available
+    NICs on each node, by computing attention within a inner ring first and then sending all KVs to the next
+    ring at once.
+    """
+
+    # Globle cache to avoid recomputation for same-lengthed sequences
+    CU_SEQLENS: torch.Tensor = None  # [B+1]
+    TOTAL_SEQLEN: int = None
+    HALF_INDICES: Tuple = None
+    SUPPORTED_MASK_TYPES = (AttnMaskType.CAUSAL, AttnMaskType.PADDED_CAUSAL)
+    ATTN_DONE: torch.cuda.Event = None
+    SP_STREAM: torch.cuda.Stream = None
+    SP_GROUP: dist.ProcessGroup = None
+    # duplicate process group for concurrent NCCL streams
+    # while both PyTorch and NCCL warns(https://github.com/pytorch/pytorch/commit/2dbe5cb979f674f0052a8eea1f7b6c3c0ba441d7)
+    # against this, in practice it seems to work fine.
+    INNER_RING_GROUP: dist.ProcessGroup = None
+    INNER_RING_GROUP_COPY: dist.ProcessGroup = None
+    INTER_RING_GROUP: dist.ProcessGroup = None
+    INTER_RING_GROUP_COPY: dist.ProcessGroup = None
+
+    @staticmethod
+    def get_double_ring_groups(sp_group, inner_ring_size=None):
+        """
+        Get 2D ring groups for the given process group. Generally, to avoid congestion, the inner ring size
+        shouldn't be larger than the number of NICs on each node.
+        Args:
+            sp_group (dist.ProcessGroup): Process group for sequence parallelism
+            inner_ring_size (Optional[int], optional): Inner ring size. Defaults to None.
+        Returns:
+            Tuple[dist.ProcessGroup, dist.ProcessGroup]: Inner-ring process group and inter-ring process group.
+        """
+        sp_size = dist.get_world_size(sp_group)
+        sp_rank = dist.get_rank(sp_group)
+
+        if inner_ring_size is None:
+            if torch.cuda.device_count() >= dist.get_world_size():
+                # single node, no need to consider NICs
+                return sp_group, sp_group
+            if sp_size <= 4:
+                inner_ring_size = min(2, sp_size)
+            else:
+                inner_ring_size = min(4, sp_size)
+        else:
+            assert (
+                inner_ring_size <= sp_size and sp_size % inner_ring_size == 0
+            ), f"Error: sp_size {sp_size} should be divisible by inner_ring_size {inner_ring_size}"
+
+        if inner_ring_size == sp_size:
+            return sp_group, sp_group
+        assert (
+            sp_size % inner_ring_size == 0
+        ), f"sp_size {sp_size} should be divisible by inner_ring_size {inner_ring_size}"
+
+        logger = get_dist_logger()
+        logger.info(
+            f"Using 2D Ring Attention with inner ring size {inner_ring_size} to maximze NIC util for inter-node comm. Cross your fingers for speed-ups!",
+            ranks=[0],
+        )
+        num_rings = sp_size // inner_ring_size
+        inner_ring_group = None
+        inter_ring_group = None
+
+        # Create inner ring groups
+        for i in range(inner_ring_size):
+            ranks = list(range(i * inner_ring_size, (i + 1) * inner_ring_size))
+            group = dist.new_group(ranks)
+            if sp_rank in ranks:
+                inner_ring_group = group
+
+        # Create inter ring groups
+        for i in range(num_rings):
+            ranks = list(range(i, sp_size, num_rings))
+            group = dist.new_group(ranks)
+            if sp_rank in ranks:
+                inter_ring_group = group
+
+        return inner_ring_group, inter_ring_group
+
+    @staticmethod
+    def attention(
+        q,  # (B, H, Sq, D)
+        k,
+        v,
+        sp_group,
+        attention_mask_type,
+        cu_seqlens=None,
+        max_seqlen=None,
+        valid_indices=None,
+        dropout_p=0.0,
+        softmax_scale=None,
+        deterministic=False,
+        return_softmax=False,
+        inner_ring_size=None,
+        **kwargs,
+    ):
+        """
+        Ring Attention forward pass supporting variable-length sequences. When using varlen mode,
+        each sequence in the batch should have length divisible by sp_size * 2.
+
+        Args:
+            q (torch.Tensor): Query tensor. Shape should be [B, nHeads, Sq, D]
+            k (torch.Tensor): Key tensor. Shape should be [B, nHeads, Sq, Sq, D]
+            v (torch.Tensor): Value tensor. Shape should be [B, nHeads, Sq, Sq, D]
+            sp_group (Optional[dist.ProcessGroup]): Process group for sequence parallelism
+            sp_tream (torch.cuda.Stream): An different stream for output correction.
+            cu_seqlens (Optional[torch.Tensor], optional): The cumulative sequence lengths
+                of the sequences in the batch, used to index into q.
+                Shape should be [B+1].
+            max_seqlen (Optional[int], optional): Maximum query sequence length in the batch.
+            valid_indices (Optional[torch.Tensor], optional): The indices of non-masked tokens from get_pad_info.
+                Shape should be [t].
+            dropout_p (float, optional): Dropout probability. Defaults to 0.0.
+            softmax_scale (Optional[float], optional): Scaling factor applied prior to softmax.
+            deterministic (bool, optional): Whether to force deterministic backward pass. See https://github.com/Dao-AILab/flash-attention/issues/349
+            return_softmax (bool, optional): Whether to return the softmax denominator (logsumexp).
+            inner_ring_size (Optional[int], optional): Inner ring size of the 2D ring. By default use a heuristic to decide.
+
+        Returns:
+            out: Output tensor of shape [B, nHeads, Sq, D] or [T, nHeads, D] if pad_output is False.
+            softmax_lse: (if return_softmax is True) Softmax denominator (logsumexp).
+                Shape should be [total_q_seqlen, nHeads]
+        """
+        # Check input args
+        _load_flash_attn()
+        if RingAttention.ATTN_DONE is None:
+            RingAttention.ATTN_DONE = torch.cuda.Event()
+        if RingAttention.SP_STREAM is None:
+            RingAttention.SP_STREAM = torch.cuda.Stream()
+
+        assert (
+            q.shape[2] == k.shape[2]
+        ), "Q, K and V having different sequence lengths (inference or cross-attn)\
+            is not supported yet in training."
+        assert (
+            attention_mask_type in RingAttention.SUPPORTED_MASK_TYPES
+        ), f"Mask type {attention_mask_type} is not supported yet."
+
+        clone_pg = lambda pg: dist.new_group(dist.get_process_group_ranks(pg))
+
+        if RingAttention.SP_GROUP is not sp_group:
+            RingAttention.SP_GROUP = sp_group
+            inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(sp_group, inner_ring_size)
+            RingAttention.INNER_RING_GROUP = inner_ring_group
+            RingAttention.INTER_RING_GROUP = inter_ring_group
+        else:
+            inner_ring_group = RingAttention.INNER_RING_GROUP
+            inter_ring_group = RingAttention.INTER_RING_GROUP
+
+        # (B, H, Sq, D) -> (B, Sq, H, D)
+        q, k, v = [x.transpose(1, 2).contiguous() for x in (q, k, v)]
+        pad_output = q.dim() == 4
+
+        # Get sequence length info for varlen forward
+        if attention_mask_type == AttnMaskType.CAUSAL:
+            # All sequences share the same length
+            b, sq, h, d = q.shape
+            max_seqlen = sq
+            # Cache to avoid recreation for a single sequence
+            if sq * b == RingAttention.TOTAL_SEQLEN:
+                cu_seqlens = RingAttention.CU_SEQLENS
+            else:
+                cu_seqlens = torch.arange(0, b * sq + 1, sq, device=q.device, dtype=torch.int32)
+                RingAttention.TOTAL_SEQLEN = b * sq
+
+        # "Packed" mode where sequences of different lengths are packed into [total_q_seqlen, H, D]
+        elif attention_mask_type == AttnMaskType.PADDED_CAUSAL:
+            assert (
+                cu_seqlens is not None and max_seqlen is not None and valid_indices is not None
+            ), "Packed mode requires pre-computed cu_seqlens and max_seq_len."
+            if pad_output:
+                b, sq, h, d = q.shape
+                q, k, v = [_unpad_input(x, valid_indices) for x in (q, k, v)]
+
+        out, softmax_lse = RingAttention.apply(
+            q,
+            k,
+            v,
+            sp_group,
+            RingAttention.SP_STREAM,
+            cu_seqlens,
+            max_seqlen,
+            dropout_p,
+            softmax_scale,
+            deterministic,
+            return_softmax,
+            attention_mask_type == AttnMaskType.PADDED_CAUSAL,
+            inner_ring_group,
+            inter_ring_group,
+        )
+
+        if attention_mask_type == AttnMaskType.PADDED_CAUSAL:
+            if pad_output:
+                out = _pad_input(out, valid_indices, b, sq)  # (T, ...) -> (B, Sq, ...)
+                out = out.transpose(1, 2)  # (B, Sq, H, D) -> (B, H, Sq, D)
+        else:
+            out = out.transpose(1, 2)
+
+        if return_softmax:
+            return out, softmax_lse
+        return out
+
+    @staticmethod
+    def forward(
+        ctx,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        sp_group: dist.ProcessGroup,
+        sp_stream: torch.cuda.Stream,
+        cu_seqlens: torch.Tensor,
+        max_seqlen: int,
+        dropout_p: float = 0.0,
+        softmax_scale: Optional[float] = None,
+        deterministic: Optional[bool] = False,
+        return_softmax: Optional[bool] = False,
+        is_packed: Optional[bool] = False,
+        inner_ring_group: Optional[dist.ProcessGroup] = None,
+        inter_ring_group: Optional[dist.ProcessGroup] = None,
+    ):
+
+        cu_seqlens_q = cu_seqlens_kv = cu_seqlens
+        max_seqlen_q = max_seqlen_kv = max_seqlen
+        cu_seqlens_half = cu_seqlens // 2
+        max_seqlen_half = max_seqlen // 2
+
+        misc_kwargs = {
+            "window_size": (-1, -1),
+            "alibi_slopes": None,
+            "softmax_scale": q.shape[-1] ** -0.5 if softmax_scale is None else softmax_scale,
+            "dropout_p": dropout_p,
+            "block_table": None,
+            "softcap": 0.0,
+            "return_softmax": False,
+        }
+
+        if (
+            RingAttention.HALF_INDICES is not None
+            and cu_seqlens.shape == RingAttention.CU_SEQLENS.shape
+            and (cu_seqlens == RingAttention.CU_SEQLENS).all()
+        ):
+            half_idx_front, half_idx_back = RingAttention.HALF_INDICES
+        else:
+            half_idx_front = get_half_index(cu_seqlens, front=True)
+            half_idx_back = get_half_index(cu_seqlens, front=False)
+            RingAttention.HALF_INDICES = (half_idx_front, half_idx_back)
+            RingAttention.CU_SEQLENS = cu_seqlens
+
+        if is_packed:
+            t, h, d = q.shape
+        else:
+            b, sq, h, d = q.shape
+            t = b * sq
+            # Be careful about GQA/MQA in reshape
+            q, k, v = [x.view(t, *x.shape[-2:]) for x in (q, k, v)]
+
+        if inner_ring_group is None or inter_ring_group is None:
+            # Use one ring if not specified
+            inner_ring_group = inter_ring_group = sp_group
+
+        sp_size = dist.get_world_size(sp_group)
+        sp_rank = dist.get_rank(sp_group)
+        # Attempt to achieve concurrent comm in the two-stream forward
+        local_kv_comms = [RingComm(inner_ring_group) for _ in range(2)]
+        inter_ring_comm = RingComm(inter_ring_group)
+        local_sp_size = dist.get_world_size(inner_ring_group)
+        local_sp_rank = dist.get_rank(inner_ring_group)
+        inter_ring_rank = dist.get_rank(inter_ring_group) if inter_ring_group is not sp_group else 0
+        num_rings = dist.get_world_size(inter_ring_group) if inter_ring_group is not sp_group else 1
+
+        # Non-contiguous indexing copies to a new contiguous tensor,
+        # so only do it once
+        if sp_rank != sp_size - 1:
+            q1 = q[half_idx_back]
+
+        # Pre-allocate double buffer for overlapping and receiving next step's inputs
+        kv_buffers = [torch.stack((k, v))]  # (2, B, Sq, H, D)
+        kv_buffers.append(torch.empty_like(kv_buffers[0]))
+
+        # outputs
+        out = None
+        block_out = [None, None]
+        softmax_lse = [None, None]
+        block_softmax_lse = [None, None]  # log sum exp, the denominator of softmax in attention
+        rng_states = [None for _ in range(sp_size)]
+        sp_streams = [torch.cuda.current_stream(), sp_stream]
+
+        def _forward(q, k, v, causal):
+            (
+                _,
+                _,
+                _,
+                _,
+                out,
+                softmax_lse,
+                _,
+                rng_state,
+            ) = _flash_attn_forward(
+                q,
+                k,
+                v,
+                cu_seqlens_q if q.shape[0] == t else cu_seqlens_half,
+                cu_seqlens_kv if k.shape[0] == t else cu_seqlens_half,
+                max_seqlen_q if q.shape[0] == t else max_seqlen_half,
+                max_seqlen_kv if k.shape[0] == t else max_seqlen_half,
+                causal=causal,
+                **misc_kwargs,
+            )
+            return out, softmax_lse, rng_state
+
+        def _kv_comm(i):
+            # Avoid overwriting attn input when it shares mem with buffer
+            if not RingAttention.ATTN_DONE.query():
+                kv_buffers[(i + 1) % 2] = torch.empty_like(kv_buffers[i % 2])
+            if i < local_sp_size - 1:
+                local_kv_comms[i % 2].send_recv(kv_buffers[i % 2], kv_buffers[(i + 1) % 2])
+
+        def _local_ring_forward():
+            # (Hopefully) overlap output correction with next flash attn
+            for i in range(local_sp_size):
+                with torch.cuda.stream(sp_streams[i % 2]):
+                    # Wait for current kv from prev rank
+                    # NOTE: waiting outside the current stream will NOT correctly synchronize.
+                    if i > 0:
+                        local_kv_comms[(i + 1) % 2].wait()
+                    if i == 0:
+                        _kv_comm(i)
+
+                    if i == 0:
+                        # Compute with local KV; no mask
+                        kv_block = kv_buffers[0]
+                        q_block = q
+                        (block_out[i % 2], block_softmax_lse[i % 2], rng_states[i]) = _forward(  # (T, H, D)  # (H, T)
+                            q_block, kv_block[0], kv_block[1], causal=True
+                        )
+                    elif i <= local_sp_rank:
+                        # Received the "surrounding" kv chunks
+                        # Drop the second half of received kv
+                        # (2, t // 2, H, D)
+                        kv_block = kv_buffers[i % 2][:, half_idx_front]
+                        q_block = q
+                        (
+                            block_out[i % 2],  # (T, H, D)
+                            block_softmax_lse[i % 2],  # (H, T)
+                            rng_states[i],
+                        ) = _forward(q_block, kv_block[0], kv_block[1], causal=False)
+                    else:
+                        # Received the inner kv chunks
+                        # Drop the first half of q
+                        kv_block = kv_buffers[i % 2]
+                        q_block = q1
+                        (
+                            block_out[i % 2],  # (T, H, D)
+                            block_softmax_lse[i % 2],  # (H, T)
+                            rng_states[i],
+                        ) = _forward(q_block, kv_block[0], kv_block[1], causal=False)
+                    RingAttention.ATTN_DONE.record()
+                    # Pipeline the next KV comm with output correction instead of the next flash attn
+                    # to minimize idle time when comm takes longer than attn.
+                    _kv_comm(i + 1)
+
+                    block_softmax_lse[i % 2] = (
+                        block_softmax_lse[i % 2].transpose(0, 1).unsqueeze(-1).contiguous().float()
+                    )  # (H, T) -> (T, H, 1)
+                    assert block_out[i % 2].shape[:-1] == block_softmax_lse[i % 2].shape[:-1]
+                    # Output and log sum exp correction. Ideally overlap this with the next flash attn kernel.
+                    # In reality this always finishes before next flash attn; no need for extra sync.
+                    if i == 0:
+                        out = block_out[0]
+                        softmax_lse = block_softmax_lse[0]
+                    elif i <= local_sp_rank:
+                        out, softmax_lse = _rescale_out_lse(
+                            out, block_out[i % 2], softmax_lse, block_softmax_lse[i % 2]
+                        )
+                    else:
+                        out[half_idx_back], softmax_lse[half_idx_back] = _rescale_out_lse(
+                            out[half_idx_back], block_out[i % 2], softmax_lse[half_idx_back], block_softmax_lse[i % 2]
+                        )
+
+            torch.cuda.current_stream().wait_stream(sp_stream)
+            return out, softmax_lse
+
+        def _other_ring_forward(ring_num_idx, out, softmax_lse):
+            # Loop through the inner ring after receiving
+            # all new KVs from the previous inner ring
+            for i in range(local_sp_size):
+                with torch.cuda.stream(sp_streams[i % 2]):
+                    # Send & recv KV
+                    if i > 0:
+                        local_kv_comms[(i + 1) % 2].wait()
+
+                    if i == 0:
+                        _kv_comm(i)
+
+                    if ring_num_idx > inter_ring_rank:
+                        kv_block = kv_buffers[i % 2]
+                        (
+                            block_out[i % 2],
+                            block_softmax_lse[i % 2],
+                            rng_states[i + local_sp_size * ring_num_idx],
+                        ) = _forward(q1, kv_block[0], kv_block[1], causal=False)
+                        RingAttention.ATTN_DONE.record()
+
+                        _kv_comm(i + 1)
+                        block_softmax_lse[i % 2] = (
+                            block_softmax_lse[i % 2].transpose(0, 1).unsqueeze(-1).contiguous().float()
+                        )
+                        out[half_idx_back], softmax_lse[half_idx_back] = _rescale_out_lse(
+                            out[half_idx_back], block_out[i % 2], softmax_lse[half_idx_back], block_softmax_lse[i % 2]
+                        )
+                    else:
+                        kv_block = kv_buffers[i % 2][:, half_idx_front]
+                        (
+                            block_out[i % 2],
+                            block_softmax_lse[i % 2],
+                            rng_states[i + local_sp_size * ring_num_idx],
+                        ) = _forward(q, kv_block[0], kv_block[1], causal=False)
+                        RingAttention.ATTN_DONE.record()
+
+                        _kv_comm(i + 1)
+                        block_softmax_lse[i % 2] = (
+                            block_softmax_lse[i % 2].transpose(0, 1).unsqueeze(-1).contiguous().float()
+                        )
+                        out, softmax_lse = _rescale_out_lse(
+                            out, block_out[i % 2], softmax_lse, block_softmax_lse[i % 2]
+                        )
+
+            torch.cuda.current_stream().wait_stream(sp_stream)
+            return out, softmax_lse
+
+        # Send and recv KV between rings at once to maximize NIC util.
+        inter_ring_kv = None
+        for ring_num_idx in range(num_rings):
+            if ring_num_idx > 0:
+                inter_ring_comm.wait()
+                # Reset indices
+                kv_buffers[0] = inter_ring_kv
+
+            if ring_num_idx < num_rings - 1:
+                if ring_num_idx == 0:
+                    to_send = kv_buffers[0]
+                else:
+                    # The last received KV
+                    to_send = kv_buffers[(local_sp_size - 1) % 2]
+                inter_ring_kv = inter_ring_comm.send_recv(to_send)
+
+            if ring_num_idx == 0:
+                out, softmax_lse = _local_ring_forward()
+            else:
+                out, softmax_lse = _other_ring_forward(ring_num_idx, out, softmax_lse)
+
+        out = out.to(q.dtype)
+        if not is_packed:
+            out = out.view(b, sq, h, d)
+            q, k, v = [x.view(b, sq, *x.shape[-2:]) for x in (q, k, v)]  # (T, H, D) -> (B, Sq, H, D)
+        softmax_lse = softmax_lse.squeeze(-1)
+
+        ctx.sp_group = sp_group
+        ctx.max_seqlen_q = ctx.max_seqlen_kv = max_seqlen
+        misc_kwargs["deterministic"] = deterministic
+        del misc_kwargs["return_softmax"]
+        ctx.misc_kwargs = misc_kwargs
+        ctx.is_packed = is_packed
+
+        ctx.kv_group = inner_ring_group
+        ctx.inter_kv_group = inter_ring_group
+
+        ctx.save_for_backward(
+            q,
+            k,
+            v,
+            out,
+            softmax_lse.transpose(0, 1).contiguous(),  # (T, H) -> (H, T)
+            cu_seqlens_q,
+            cu_seqlens_kv,
+            half_idx_front,
+            half_idx_back,
+            *rng_states,
+        )
+
+        if return_softmax:
+            return out, softmax_lse
+        return out, None
+
+    def backward(ctx, dout, _):
+        """
+        During backward, we accumulate q grads on each rank locally, but iterate kv and their grads
+        over all ranks for accumulation.
+        """
+        (q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_kv, half_idx_front, half_idx_back) = ctx.saved_tensors[:9]
+        rng_states = ctx.saved_tensors[9:]
+
+        is_packed = ctx.is_packed
+        max_seqlen_q = ctx.max_seqlen_q
+        max_seqlen_kv = ctx.max_seqlen_kv
+        cu_seqlens_half = cu_seqlens_q // 2
+        max_seqlen_half = max_seqlen_q // 2
+        misc_kwargs = ctx.misc_kwargs
+        del misc_kwargs["block_table"]
+
+        assert (
+            out.shape == dout.shape == q.shape
+        ), f"out {out.shape} and dout {dout.shape} should have the same shape ({q.shape})."
+
+        if is_packed:
+            t, h, d = q.shape
+        else:
+            b, sq, h, d = q.shape
+            t = b * sq
+        q, k, v, out, dout = [x.view(t, *x.shape[-2:]) for x in (q, k, v, out, dout)]
+
+        # Sequence parallel args
+        sp_group = ctx.sp_group
+        local_kv_group = ctx.kv_group
+        inter_kv_group = ctx.inter_kv_group
+
+        local_sp_rank = dist.get_rank(sp_group)
+        sp_size = dist.get_world_size(sp_group)
+        # Using separate streams (pg) for concurrent kv and dkv comm may
+        # cause NCCL "software caused connection abort" here...
+        local_kv_comm = RingComm(local_kv_group)
+        local_dkv_comm = RingComm(local_kv_group)
+        inter_kv_comm = RingComm(inter_kv_group)
+        inter_dkv_comm = RingComm(inter_kv_group)
+        local_sp_size = dist.get_world_size(local_kv_group)
+        local_sp_rank = dist.get_rank(local_kv_group)
+
+        if dist.get_world_size(inter_kv_group) != sp_size:
+            num_rings = dist.get_world_size(inter_kv_group)
+            inter_ring_rank = dist.get_rank(inter_kv_group)
+        else:
+            num_rings = 1
+            inter_ring_rank = 0
+
+        if local_sp_rank != sp_size - 1:
+            softmax_lse1 = softmax_lse[:, half_idx_back]
+        dout = dout.contiguous()
+
+        # Double comm buffers for sending and receiving kv
+        kv_buffers = [torch.stack((k, v))]  # (2, T, H, D)
+        kv_buffers.append(torch.empty_like(kv_buffers[0]))
+
+        dq = None  # (T, H, D)
+        # Intermediate outputs
+        dq_block = torch.empty_like(q)  # (T, H, D)
+        dk_block = torch.empty_like(k)  # (T, H, D)
+        dv_block = torch.empty_like(v)  # (T, H, D)
+        dkv_buffers = [torch.empty_like(kv, dtype=torch.float32) for kv in kv_buffers]  # (T, H, D)
+        del k, v
+
+        def _backward(dout, q, k, v, out, softmax_lse, dq, dk, dv, rng_state, causal):
+            _flash_attn_backward(
+                dout,
+                q,
+                k,
+                v,
+                out,
+                softmax_lse,
+                dq,
+                dk,
+                dv,
+                cu_seqlens_q if dq.shape[0] == t else cu_seqlens_half,
+                cu_seqlens_kv if dk.shape[0] == t else cu_seqlens_half,
+                max_seqlen_q if dq.shape[0] == t else max_seqlen_half,
+                max_seqlen_kv if dk.shape[0] == t else max_seqlen_half,
+                causal=causal,
+                rng_state=rng_state,
+                **misc_kwargs,
+            )
+
+        # NOTE: We avoid using two streams due to doubled buffers
+        # and that backward is more communication intensive.
+        def _local_ring_backward():
+            for i in range(local_sp_size):
+                if i > 0:
+                    local_kv_comm.wait()
+
+                if i < local_sp_size - 1:
+                    # Send kv to next rank for backward
+                    local_kv_comm.send_recv(kv_buffers[i % 2], kv_buffers[(i + 1) % 2])
+
+                if i == 0:
+                    # Backward with local kv
+                    k_, v_ = kv_buffers[i % 2]
+                    q_, dout_, out_ = q, dout, out
+                    dq_, dk_, dv_ = dq_block, dk_block, dv_block
+                    _backward(dout_, q_, k_, v_, out_, softmax_lse, dq_, dk_, dv_, rng_states[i], causal=True)
+
+                elif i <= local_sp_rank:
+                    # Drop the second half of kv
+                    # (T, H, D) -> (T // 2, H, D)
+                    k_, v_ = [x[half_idx_front] for x in kv_buffers[i % 2]]
+                    dk_, dv_ = [x[: t // 2] for x in (dk_block, dv_block)]
+                    dq_, q_, out_, dout_ = (dq_block, q, out, dout)
+                    _backward(dout_, q_, k_, v_, out_, softmax_lse, dq_, dk_, dv_, rng_states[i], causal=False)
+
+                else:
+                    # Drop the first half of q
+                    k_, v_ = kv_buffers[i % 2]
+                    dk_, dv_ = dk_block, dv_block
+                    q_, out_, dout_ = [x[half_idx_back] for x in (q, out, dout)]
+                    dq_ = dq_block[: t // 2]
+                    _backward(dout_, q_, k_, v_, out_, softmax_lse1, dq_, dk_, dv_, rng_states[i], causal=False)
+
+                # Accumulate grads
+                if i == 0:
+                    dq = dq_block.float()
+                    dkv_buffers[i % 2][0] = dk_block.float()
+                    dkv_buffers[i % 2][1] = dv_block.float()
+                else:
+                    # Accumulate local dq
+                    if i <= local_sp_rank:
+                        dq += dq_  # (T, H, D)
+                    else:
+                        dq[half_idx_back] += dq_
+
+                    # Wait for mobile kv grad accumulators
+                    local_dkv_comm.wait()
+
+                    if i <= local_sp_rank:
+                        # q blocks "surrounded" by kv blocks
+                        dkv_buffers[i % 2][0][half_idx_front] += dk_
+                        dkv_buffers[i % 2][1][half_idx_front] += dv_
+                    else:
+                        # q blocks "surrounding" kv blocks
+                        dkv_buffers[i % 2][0] += dk_
+                        dkv_buffers[i % 2][1] += dv_
+                local_dkv_comm.send_recv(send_tensor=dkv_buffers[i % 2], recv_tensor=dkv_buffers[(i + 1) % 2])
+
+            local_dkv_comm.wait()
+            dkv_recv = dkv_buffers[local_sp_size % 2]
+            dkv_send = dkv_buffers[(local_sp_size - 1) % 2]
+            return dq, dkv_recv, dkv_send
+
+        def _other_ring_backward(ring_num_idx, dq):
+            if ring_num_idx > inter_ring_rank:
+                # Indexing is expensive
+                q_, out_, dout_ = [x[half_idx_back] for x in (q, out, dout)]
+            else:
+                q_, out_, dout_ = (q, out, dout)
+
+            for i in range(local_sp_size):
+                if i > 0:
+                    local_kv_comm.wait()
+
+                if i < local_sp_size - 1:
+                    local_kv_comm.send_recv(kv_buffers[i % 2], kv_buffers[(i + 1) % 2])
+
+                rng_state = rng_states[i + local_sp_size * ring_num_idx]
+                if ring_num_idx > inter_ring_rank:
+                    k_, v_ = kv_buffers[i % 2]
+                    dk_, dv_ = dk_block, dv_block
+                    dq_ = dq_block[: t // 2]
+                    _backward(dout_, q_, k_, v_, out_, softmax_lse1, dq_, dk_, dv_, rng_state, causal=False)
+
+                    dq[half_idx_back] += dq_
+                    if i > 0:
+                        local_dkv_comm.wait()
+                    else:
+                        inter_dkv_comm.wait()
+
+                    dkv_buffers[i % 2][0] += dk_
+                    dkv_buffers[i % 2][1] += dv_
+                else:
+                    k_, v_ = [x[half_idx_front] for x in kv_buffers[i % 2]]
+                    dk_, dv_ = [x[: t // 2] for x in (dk_block, dv_block)]
+                    dq_ = dq_block
+                    _backward(dout_, q_, k_, v_, out_, softmax_lse, dq_, dk_, dv_, rng_state, causal=False)
+
+                    dq += dq_
+                    if i > 0:
+                        local_dkv_comm.wait()
+                    else:
+                        inter_dkv_comm.wait()
+
+                    dkv_buffers[i % 2][0][half_idx_front] += dk_
+                    dkv_buffers[i % 2][1][half_idx_front] += dv_
+
+                local_dkv_comm.send_recv(send_tensor=dkv_buffers[i % 2], recv_tensor=dkv_buffers[(i + 1) % 2])
+
+            local_dkv_comm.wait()
+            dkv_recv = dkv_buffers[local_sp_size % 2]
+            dkv_send = dkv_buffers[(local_sp_size - 1) % 2]
+            return dq, dkv_recv, dkv_send
+
+        inter_ring_kv = None
+        for ring_num_idx in range(num_rings):
+            if ring_num_idx > 0:
+                inter_kv_comm.wait()
+                kv_buffers[0] = inter_ring_kv
+
+            if ring_num_idx < num_rings - 1:
+                # Re-allocate a buffer in each inter-ring step
+                inter_ring_kv = inter_kv_comm.send_recv(kv_buffers[0])
+
+            if ring_num_idx == 0:
+                dq, dkv_recv, dkv_send = _local_ring_backward()
+            else:
+                dq, dkv_recv, dkv_send = _other_ring_backward(ring_num_idx, dq)
+
+            if num_rings > 1:
+                # Reuse the local buffers
+                inter_dkv_comm.send_recv(send_tensor=dkv_recv, recv_tensor=dkv_send)
+                # Reset indices
+                dkv_buffers[0] = dkv_send
+                dkv_buffers[1] = dkv_recv
+                if ring_num_idx == num_rings - 1:
+                    inter_dkv_comm.wait()
+                    dkv_recv = dkv_buffers[0]
+
+        dq, dk, dv = [x.to(q.dtype) for x in (dq, *dkv_recv)]
+        if not is_packed:
+            dq, dk, dv = [x.view(b, sq, *x.shape[-2:]) for x in (dq, dk, dv)]
+
+        return (dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None)
+
+    @staticmethod
+    def prepare_varlen_batch(
+        attention_mask: torch.Tensor,
+        sp_group: dist.ProcessGroup,
+        inputs_embeds: torch.Tensor = None,
+        position_ids: Optional[torch.Tensor] = None,
+        is_label: bool = False,
+        is_2d: bool = True,
+    ):
+        """
+        Preprocess a batch of padded sequence by splitting input sequence by sp_size
+        sequence-wise and packing them into one sequence. Updates the mask info accordingly.
+        Args:
+            attention_mask (torch.Tensor): Contains the mask [B, Sq], where True means the token is NOT masked.
+            sp_group (dist.ProcessGroup): Process group for sequence parallelism
+            inputs_embeds (torch.Tensor): Input embeddings. Shape should be [B, Sq, ...]
+            position_ids (Optional[torch.Tensor], optional): Position ids of shape [Sq] or [1, Sq]. Defaults to None.
+            is_label (bool, optional): Whether inputs_embeds is instead a label tensor. If True, mask out the first
+                token of each sequence.
+            is_2d (bool, optional): Whether to return 2D outputs padded to max_seqlen // sp_size or flatten
+                the batch dim to a packed 1d sequence. Contingent on model forward shape definitions.
+
+        Returns:
+            inputs_embeds: Packed input embeddings of shape [B, Sq // sp_size, ...].
+            mask_info: A dictionary of mask info.
+            position_ids: Packed position ids of shape [..., Sq // sp_size].
+
+        """
+        _load_varlen_helpers()
+        sp_size = dist.get_world_size(group=sp_group)
+        sp_rank = dist.get_rank(group=sp_group)
+        mask_info = {}
+        mask_info["max_seqlen"], mask_info["cu_seqlens"] = get_pad_info(attention_mask, return_indices=False)
+
+        # Unpad, split seq-wise, then pad back to (B, max_seqlen // sp_size)
+        # Split mask to compute local nonzero position indices
+        # (B, Sq) -> (B, max_seqlen // sp_size)
+        attention_mask = attention_mask[:, : mask_info["max_seqlen"]]
+        if inputs_embeds is not None:
+            inputs_embeds = inputs_embeds[:, : mask_info["max_seqlen"]]
+            inputs_embeds = split_varlen_zigzag(
+                inputs_embeds,
+                mask_info["cu_seqlens"],
+                sp_group,
+                mask_info["max_seqlen"],
+                is_2d=is_2d,
+                is_label=is_label,
+            )
+        attention_mask = split_varlen_zigzag(
+            attention_mask, mask_info["cu_seqlens"], sp_group, mask_info["max_seqlen"], is_2d=is_2d
+        )
+
+        if position_ids is not None:
+            indices = torch.tensor([sp_rank, 2 * sp_size - sp_rank - 1], device=inputs_embeds.device)
+            position_ids = (
+                position_ids[..., : mask_info["max_seqlen"]]  # unpad
+                .view(-1, sp_size * 2, mask_info["max_seqlen"] // (sp_size * 2))
+                .index_select(-2, indices)
+                .view(-1, mask_info["max_seqlen"] // sp_size)
+            )
+
+        mask_info["max_seqlen"] //= sp_size
+        mask_info["valid_indices"] = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+        mask_info["cu_seqlens"] //= sp_size
+        mask_info["attention_mask_type"] = AttnMaskType.PADDED_CAUSAL
+        return inputs_embeds, mask_info, position_ids
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index 4e5ebef0dcb6..d77dd496592f 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -202,19 +202,21 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
         # Matrix multiply.
         bias = self.bias if not self.skip_bias_add else None
 
-        if self.seq_parallel_mode is None:
-            output_parallel = linear_with_async_comm(
-                input_parallel, self.weight, bias, self.process_group, True, fp8_communication=self.fp8_communication
-            )
-        elif self.seq_parallel_mode == "split_gather":
+        if self.seq_parallel_mode == "split_gather":
             input_parallel = gather_forward_reducescatter_backward(
                 input_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
             )
-            output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, False)
+            output_parallel = linear_with_async_comm(
+                input_parallel, self.weight, bias, self.process_group, False, fp8_communication=self.fp8_communication
+            )
         elif self.seq_parallel_mode == "ring":
             output_parallel = linear_gather_forward_reducescatter_backward(
                 input_parallel, self.weight, bias, self.process_group, True, self.seq_parallel_dim, self.overlap, True
             )
+        else:
+            output_parallel = linear_with_async_comm(
+                input_parallel, self.weight, bias, self.process_group, True, fp8_communication=self.fp8_communication
+            )
 
         if self.gather_output:
             # All-gather across the partitions.
@@ -442,6 +444,9 @@ def forward(self, input_: Tensor) -> Tensor:
                     dim=self.seq_parallel_dim,
                     ring=True,
                 )
+            else:
+                output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
+                output = reduce_forward(output_parallel, self.process_group)
 
         if not self.skip_bias_add:
             if self.bias is not None:
diff --git a/colossalai/shardformer/layer/loss.py b/colossalai/shardformer/layer/loss.py
index cea2da03fb58..12df824d1c0c 100644
--- a/colossalai/shardformer/layer/loss.py
+++ b/colossalai/shardformer/layer/loss.py
@@ -4,10 +4,15 @@
 from torch.distributed import ProcessGroup
 from torch.nn import CrossEntropyLoss
 
+from colossalai.shardformer.layer._operation import reduce_forward
 from colossalai.shardformer.shard import ShardConfig
 
+from .utils import is_share_sp_tp
+
 __all__ = ["DistCrossEntropy", "cross_entropy_1d", "dist_cross_entropy"]
 
+_IGNORE_IDX = -100
+
 
 class DistCrossEntropy(Function):
     r"""
@@ -26,11 +31,12 @@ def forward(
         process_group: ProcessGroup,
         vocab_size: int,
         dtype=torch.float32,
+        mode="mean",
     ):
         r"""
         Calculate the cross entropy loss before gather, the origin loss function is as follows:
         loss = -log(exp(x[class])/sum(exp(x[i]))
-        and can be rewrite as:
+        and can be rewriten as:
         loss = log(sum(exp(x[i])) - x[class]
 
         To avoid the `nan` of log(sum(exp(x[i]))), we minus the max of x[i]
@@ -44,12 +50,10 @@ def forward(
         Returns:
             :class:`torch.Tensor`: The cross entropy loss
         """
+        assert mode in ["mean", "sum"]
         # get the max
         logits_max = torch.max(vocab_logits, dim=-1)[0]
-        dist.all_reduce(logits_max, op=dist.ReduceOp.MAX, group=process_group)
-
-        # minus the max to avoid the result of sum of exp is too large and the log is nan
-        vocab_logits = vocab_logits - logits_max.unsqueeze(dim=-1)
+        handle = dist.all_reduce(logits_max, op=dist.ReduceOp.MAX, group=process_group, async_op=True)
 
         # mask the target in the local device
         rank = dist.get_rank(group=process_group)
@@ -70,24 +74,25 @@ def forward(
         mask = (target < down_threshold) | (target >= up_threshold)
         masked_target = target.clone() - down_threshold
         masked_target[mask] = 0
+        masked_target_1d = masked_target.view(-1).contiguous()
 
+        # minus the max to avoid the result of sum of exp is too large and the log is nan
+        handle.wait()
+        vocab_logits = vocab_logits - logits_max.unsqueeze(dim=-1)
         # reshape the logits and target
         # reshape the vocab_logits to [bath_size * seq_len, vocab_size]
         # reshape the labels to [bath_size * seq_len]
         self_vocab_size = vocab_logits.size()[-1]
         logits_2d = vocab_logits.view(-1, self_vocab_size)
-        masked_target_1d = masked_target.view(-1)
 
         # extract the x[class] and set the x[other device] to zero
-        pred_logits_1d = logits_2d[
-            torch.arange(start=0, end=logits_2d.shape[0], device=logits_2d.device), masked_target_1d
-        ]
-        pred_logits_1d = pred_logits_1d.clone().contiguous()
+        idx = torch.arange(start=0, end=logits_2d.shape[0], device=logits_2d.device)
+        pred_logits_1d = logits_2d[idx, masked_target_1d].contiguous()
         pred_logits = pred_logits_1d.view_as(target)
         pred_logits[mask] = 0.0
 
-        # allreduce the get all x(i,y)
-        dist.all_reduce(pred_logits, op=dist.ReduceOp.SUM, group=process_group)
+        # all-reduce to get full x[i, y]
+        handle = dist.all_reduce(pred_logits, op=dist.ReduceOp.SUM, group=process_group, async_op=True)
         exp_logits = vocab_logits
         torch.exp(vocab_logits, out=exp_logits)
         sum_exp_logits = torch.sum(exp_logits, dim=-1, dtype=torch.float32)
@@ -95,23 +100,29 @@ def forward(
 
         # calculate the loss
         # loss = log(sum(exp(x[i]))) - x[class]
+        handle.wait()
         loss = torch.where(target == ignore_index, 0.0, torch.log(sum_exp_logits) - pred_logits)
-        num_non_zero = torch.sum(loss != 0.0)
-        ctx.inv_num_non_zero = 1.0 / num_non_zero
-        loss = torch.sum(loss).div_(num_non_zero)
+        if mode == "mean":
+            num_non_zero = torch.sum(loss != 0.0)
+            ctx.inv_num_non_zero = 1.0 / num_non_zero
+            loss = torch.sum(loss).div_(num_non_zero)
+        else:
+            loss = torch.sum(loss)
 
         # calculate the softmax
         exp_logits = exp_logits.div(sum_exp_logits.unsqueeze(dim=-1)).to(dtype)
         exp_logits[target == ignore_index] = 0.0
         ctx.save_for_backward(exp_logits, mask, masked_target_1d)
         ctx.dtype = dtype
+        ctx.mode = mode
 
         return loss
 
     @staticmethod
     def backward(ctx, grad_output):
         # retrieve the saved tensors
-        grad_output = grad_output * ctx.inv_num_non_zero
+        if ctx.mode == "mean":
+            grad_output = grad_output * ctx.inv_num_non_zero
         exp_logits, mask, masked_target_1d = ctx.saved_tensors
 
         # use exp logits as the input grad
@@ -123,55 +134,113 @@ def backward(ctx, grad_output):
         grad_logits_2d[torch.arange(0, grad_logits_2d.shape[0]), masked_target_1d] -= update
 
         grad_logits.mul_(grad_output.unsqueeze(dim=-1))
-        return grad_logits, None, None, None, None, None
+        return grad_logits, None, None, None, None, None, None
 
 
 def cross_entropy_1d(
     vocab_logits: torch.Tensor,
     labels: torch.Tensor,
-    ignore_index: int = -100,
+    ignore_index: int = _IGNORE_IDX,
     process_group: ProcessGroup = None,
     vocab_size: int = None,
     dtype: torch.dtype = None,
+    mode: str = "mean",
 ) -> torch.Tensor:
-    return DistCrossEntropy.apply(vocab_logits, labels, ignore_index, process_group, vocab_size, dtype)
+    return DistCrossEntropy.apply(vocab_logits, labels, ignore_index, process_group, vocab_size, dtype, mode)
 
 
 def dist_cross_entropy(
-    labels: torch.Tensor,
-    logits: torch.Tensor,
+    labels: torch.Tensor,  # [B, S] or [B, S, Vocab_size]
+    logits: torch.Tensor,  # [B, S, Vocab_size]
     shard_config: ShardConfig,
     out_features: int,
     vocab_size: int,
     dtype: torch.dtype,
+    seq_dim: int = 1,
 ) -> torch.Tensor:
     """
-    Helper to compute cross entropy loss for most shardformer models,
-    compatible with PP, TP and SP.
+    Helper to compute cross entropy loss for most shardformer models supporting PP, TP and SP.
     """
-    if labels is not None:
-        # Shift so that tokens < n predict n
-        shift_logits = logits[..., :-1, :].contiguous()
-        shift_labels = labels[..., 1:].contiguous()
-        # Flatten the tokens
-        loss_fct = CrossEntropyLoss()
-        shift_labels = shift_labels.view(-1)
-        shift_labels = shift_labels.to(shift_logits.device)
-        if shard_config.enable_tensor_parallelism and shard_config.parallel_output:
-            # Cross entropy with all-reduce for TP
-            new_vocab_size = logits.shape[-1]
-            shift_logits = shift_logits.view(-1, new_vocab_size)
-            loss = cross_entropy_1d(
-                shift_logits,
-                shift_labels,
-                process_group=shard_config.tensor_parallel_process_group,
-                vocab_size=out_features,
-                dtype=dtype,
-            )
-        else:
-            # NOTE if use TP and not parallel_output, the output is gathered.
-            # see VocabParallelLMHead1D
-            shift_logits = shift_logits.view(-1, vocab_size)
-            loss = loss_fct(shift_logits, shift_labels)
-
-        return loss
+    # Split labels if not gather output
+    sp_group = shard_config.sequence_parallel_process_group
+    sp_rank = dist.get_rank(sp_group)
+    sp_size = shard_config.sequence_parallel_size
+    sp_mode = shard_config.sequence_parallelism_mode
+    parallel_output = shard_config.parallel_output
+    is_tp = shard_config.enable_tensor_parallelism
+    is_packed = labels.dim() == 2
+    if is_packed:
+        bs, seq_len = labels.shape
+    else:
+        # padded sequence
+        seq_len = labels.shape[-1]
+        logits = logits.reshape(-1, *logits.shape[2:])
+        seq_dim = 0
+
+    # Shift labels to predict the next token, and remove the tail logit predicting <EOS>
+    is_sp = sp_size > 1 and (not is_share_sp_tp(sp_mode))
+    split_labels_here = seq_len // sp_size == logits.size(seq_dim)  # ring attn splits labels before forward
+
+    if sp_mode == "ring_attn":
+        # For Zigzag Ring Attention, labels should've been split and
+        # shifted by RingAttention.prepare_varlen_batch()
+        if sp_rank == 0:
+            logits = logits[..., :-1, :]
+            logits = torch.cat([logits, torch.full_like(logits[:, :1, :], _IGNORE_IDX)], dim=seq_dim)
+    elif is_sp:
+        # Shift only once: either before splitting or in the last rank without splitting
+        if split_labels_here or (sp_rank == sp_size - 1):
+            labels = labels[..., 1:]
+        if split_labels_here:
+            labels = labels.split(seq_len // sp_size, dim=-1)[sp_rank]
+
+        if sp_rank == sp_size - 1:
+            logits = logits[..., :-1, :]
+            # Pad logits and labels to the same shape across all ranks for TP all_reduce
+            if is_tp and parallel_output:
+                # If is packed sequence (label dim is 1), then each seq already has the end label token padded.
+                # torch.cat is faster than F.pad...
+                pad_shape = (logits.shape[0], 1, *logits.shape[2:]) if is_packed else (1, *logits.shape[1:])
+                padding = torch.full(pad_shape, _IGNORE_IDX, dtype=logits.dtype, device=logits.device)
+                logits = torch.cat([logits, padding], dim=seq_dim)
+                pad_shape = (labels.shape[0], 1) if is_packed else (1,)
+                padding = torch.full(pad_shape, _IGNORE_IDX, dtype=labels.dtype, device=labels.device)
+                labels = torch.cat([labels, padding], dim=seq_dim)
+    else:
+        labels = labels[..., 1:]
+        logits = logits[..., :-1, :]
+    labels = labels.contiguous()
+    logits = logits.contiguous()
+    num_nonzero = (labels != _IGNORE_IDX).sum()
+    assert labels.shape == logits.shape[:-1], f"label shape {labels.shape} does not match logit shape {logits.shape}"
+
+    # Flatten the tokens
+    loss_fct = CrossEntropyLoss(ignore_index=_IGNORE_IDX, reduction="sum")
+    labels = labels.view(-1)
+
+    if is_tp and parallel_output:
+        # Cross entropy with all-reduce for TP
+        new_vocab_size = logits.shape[-1]
+        logits = logits.view(-1, new_vocab_size)
+        loss = cross_entropy_1d(
+            logits,
+            labels,
+            process_group=shard_config.tensor_parallel_process_group,
+            vocab_size=out_features,
+            dtype=dtype,
+            mode="sum",
+        )
+    else:
+        # NOTE if use TP and not parallel_output, the output is gathered in VocabParallelLMHead1D
+        logits = logits.view(-1, vocab_size)
+        loss = loss_fct(logits, labels)
+
+    # Reduce loss instead of gathering logits over seq dim for savings
+    if split_labels_here or sp_mode == "ring_attn":
+        # Get the global non-zero count
+        loss = torch.stack((loss, num_nonzero))
+        # Rescale to offset the grad / (DP * SP) in HybridParallelPlugin
+        loss = reduce_forward(loss, sp_group, grad_scale=sp_size)
+        loss, num_nonzero = loss[0], loss[1].detach()
+    loss = (loss / num_nonzero).squeeze()
+    return loss
diff --git a/colossalai/shardformer/layer/normalization.py b/colossalai/shardformer/layer/normalization.py
index 59e1da9fc58f..043bf6aeb4cd 100644
--- a/colossalai/shardformer/layer/normalization.py
+++ b/colossalai/shardformer/layer/normalization.py
@@ -42,7 +42,7 @@ def forward(self, input):
             return output
 
 except ImportError:
-    warnings.warn("Please install apex from source (https://github.com/NVIDIA/apex) to use the fused layernorm kernel")
+    warnings.warn("Please install apex from source (https://github.com/NVIDIA/apex) to use the fused RMSNorm kernel")
 
 FAST_LAYERNORM_SUPPORTED_SIZE = [
     1024,
@@ -270,12 +270,6 @@ def from_native_module(module: nn.Module, sp_partial_derived: bool = False, *arg
         Returns:
             nn.Module: FusedRMSNorm module.
         """
-        try:
-            pass
-        except ImportError:
-            raise ImportError(
-                "Please install apex from source (https://github.com/NVIDIA/apex) to use the fused RMS normalization kernel"
-            )
 
         LazyInitContext.materialize(module)
 
@@ -284,11 +278,18 @@ def from_native_module(module: nn.Module, sp_partial_derived: bool = False, *arg
         eps = module.variance_epsilon if hasattr(module, "variance_epsilon") else module.eps
         elementwise_affine = getattr(module, "elementwise_affine", True)
 
-        rmsnorm = FusedRMSNormWithHook(
-            normalized_shape=normalized_shape,
-            eps=eps,
-            elementwise_affine=elementwise_affine,
-        )
+        try:
+            rmsnorm = FusedRMSNormWithHook(
+                normalized_shape=normalized_shape,
+                eps=eps,
+                elementwise_affine=elementwise_affine,
+            )
+        except ImportError:
+            warnings.warn(
+                "Module replacement failed.\
+                Please install apex from source (https://github.com/NVIDIA/apex) to use the fused RMS normalization kernel"
+            )
+            return module
 
         rmsnorm.weight = module.weight
 
diff --git a/colossalai/shardformer/layer/qkv_fused_linear.py b/colossalai/shardformer/layer/qkv_fused_linear.py
index 561867993402..f9a41a467300 100644
--- a/colossalai/shardformer/layer/qkv_fused_linear.py
+++ b/colossalai/shardformer/layer/qkv_fused_linear.py
@@ -555,7 +555,7 @@ def forward(self, input_: Tensor) -> Tensor:
         else:
             if self.seq_parallel_mode is None:
                 output_parallel = torch.matmul(input_, self.weight)
-                output = reduce_forward(output_parallel, self.process_group, self.fp8_communication)
+                output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
             elif self.seq_parallel_mode == "split_gather":
                 output_parallel = torch.matmul(input_, self.weight)
                 output = reducescatter_forward_gather_backward(
diff --git a/colossalai/shardformer/layer/utils.py b/colossalai/shardformer/layer/utils.py
index 9c6ced4454dc..c1a73ce05c97 100644
--- a/colossalai/shardformer/layer/utils.py
+++ b/colossalai/shardformer/layer/utils.py
@@ -1,5 +1,5 @@
 from contextlib import contextmanager
-from typing import List
+from typing import List, Optional, Union
 
 import torch
 import torch.distributed as dist
@@ -289,3 +289,199 @@ def create_randomizer_with_offset(
         Randomizer.increment_index()
 
     return Randomizer(seed=base_seed)
+
+
+def split_batch_zigzag(
+    batch: Union[torch.Tensor, List[torch.Tensor]], sp_group: ProcessGroup, seq_dim: int = 1, is_label: bool = False
+) -> Union[torch.Tensor, List[torch.Tensor]]:
+    """
+    Split the input along the sequence dimension for Ring Attention. Naively spliting the attention mask
+    in the causal setting will result in the preceding ranks having much less workload.
+    We split after "folding" the 2D attention mask in half (https://github.com/zhuzilin/ring-flash-attention/issues/2).
+    For example, for sp_size = 4 and seq_len = 8, we get | s0, s7 | s1, s6 | s2, s5 | s3, s4 |.
+
+    Args:
+        batch (List[torch.Tensor] or Tensor): The input tensor(s) to split.
+        sp_group (ProcessGroup): The process group for sequence parallelism.
+        seq_dim (int): The sequence dimension to split.
+        is_label (bool): If True, mask and shift the tensor for next token prediction.
+
+    """
+    sp_size = dist.get_world_size(sp_group)
+    sp_rank = dist.get_rank(sp_group)
+    if isinstance(batch, torch.Tensor):
+        batch = [batch]
+    seq_dim = seq_dim if seq_dim != -1 else batch[0].dim() - 1
+
+    if sp_size > 1:
+        for idx, tensor in enumerate(batch):
+            assert (
+                tensor.shape[seq_dim] // (sp_size * 2) > 1 and tensor.shape[seq_dim] % (sp_size * 2) == 0
+            ), f"Bro, the seq length {tensor.shape[seq_dim]} for tensor {idx} can't be split by {sp_size * 2}!"
+            if is_label:
+                assert tensor.dim() == 2, "Label shape should be (B, Seqlen)"
+                tensor = torch.cat([tensor[:, 1:], torch.full_like(tensor[:, :1], -100)], dim=1)
+
+            tensor = tensor.view(
+                *tensor.shape[:seq_dim],
+                2 * sp_size,
+                tensor.shape[seq_dim] // (2 * sp_size),
+                *tensor.shape[seq_dim + 1 :],
+            )
+            indices = torch.tensor([sp_rank, 2 * sp_size - 1 - sp_rank], device=tensor.device)
+            tensor = tensor.index_select(seq_dim, indices).contiguous()
+            # (B, 2, Sq // (2 * sp_size), ...) -> (B, Sq // sp_size, ...)
+            batch[idx] = tensor.view(*tensor.shape[:seq_dim], -1, *tensor.shape[seq_dim + 2 :])
+
+    if len(batch) == 1:
+        return batch[0]
+    return batch
+
+
+def split_varlen_zigzag(
+    batch: Union[List[torch.Tensor], torch.Tensor],
+    cu_seqlens: torch.Tensor,
+    sp_group: ProcessGroup,
+    max_seqlen: int = 0,
+    is_2d: bool = False,
+    is_label: bool = False,
+) -> Union[List[torch.Tensor], torch.Tensor]:
+    """Split each sequence in a batch of packed sequences in a zigzag fashion.
+        For each tensor in batch, return packed sequences if is_2d is False;
+        else return a padded batch of sequences.
+
+    Args:
+        batch (List[torch.Tensor]): Packed sequences of shape (B * Sq, ...), or (B, Sq, ...) if is_2d.
+        cu_seqlens (torch.Tensor): Cumulative sequence lengths of shape (B + 1) before splitting.
+        sp_group (ProcessGroup): The process group for sequence parallelism.
+        max_seqlen (int): The maximum sequence length in the batch before splitting.
+        is_2d (bool): If True, then input has batch size and sequence length split into two dimensions.
+        is_label (bool): If True, mask out the first token in each sequence (<Start of Sentence>).
+
+    Returns:
+        batch (List[torch.Tensor]): Packed sequences of shape (B * max_seqlen // sp_size)
+            or (B, max_seqlen // sp_size, ...) if is_2d
+    """
+    sp_size = dist.get_world_size(sp_group)
+    sp_rank = dist.get_rank(sp_group)
+    if is_2d:
+        assert max_seqlen > 0, "max_seqlen must be provided for 2D input"
+
+    if isinstance(batch, torch.Tensor):
+        batch = [batch]
+    for i, packed_seq in enumerate(batch):
+        device = packed_seq.device
+        dtype = packed_seq.dtype
+
+        if is_2d:
+            assert max_seqlen % (sp_size * 2) == 0
+            # Recreate a padded tensor with the new max seqlen
+            shape = (packed_seq.shape[0], max_seqlen // sp_size, *packed_seq.shape[2:])
+            local_seq = torch.zeros(shape, dtype=dtype, device=device)
+        else:
+            total_seqlen = cu_seqlens[-1]
+            assert (
+                total_seqlen % (2 * sp_size) == 0
+            ), f"total_seqlen {total_seqlen} must be divisible by 2 * sp_size = {2 * sp_size}"
+            local_seq = []
+
+        for j in range(len(cu_seqlens) - 1):
+            start, end = cu_seqlens[j], cu_seqlens[j + 1]
+            seqlen = end - start
+            assert (
+                seqlen % (2 * sp_size) == 0
+            ), f"batch {i} seq {j}'s length ({seqlen}) must be divisible by 2 * sp_size = {2 * sp_size} for splitting"
+
+            if is_2d:
+                seq = packed_seq[j][:seqlen]
+                if is_label:
+                    # Shift one position to the right for next token prediction
+                    seq = torch.cat([seq[1:], torch.tensor([-100], dtype=dtype, device=device)])
+
+                seq = seq.chunk(2 * sp_size, dim=0)
+                half = seqlen // sp_size // 2
+                local_seq[j][:half] = seq[sp_rank]
+                local_seq[j][half : seqlen // sp_size] = seq[2 * sp_size - 1 - sp_rank]
+            else:
+                seq = packed_seq[start:end]
+                if is_label:
+                    seq = torch.cat(seq[1:], torch.tensor([-100], dtype=dtype, device=device))
+                seq = seq.chunk(sp_size * 2)
+                local_seq.extend([seq[sp_rank], seq[2 * sp_size - 1 - sp_rank]])
+
+        if is_2d:
+            batch[i] = local_seq.contiguous()
+        else:
+            batch[i] = torch.cat(local_seq, dim=0)
+
+    if len(batch) == 1:
+        batch = batch[0]
+    return batch
+
+
+def is_share_sp_tp(sp_mode: str):
+    """sp_mode "ring" and "split_gather" use the TP group as SP group
+    to split both the vocab and sequence, so we must gather the sequence
+    to correctly get logits at each positions.
+    """
+    return sp_mode in ["ring", "split_gather"]
+
+
+class RingComm:
+    def __init__(self, process_group: dist.ProcessGroup):
+        self._process_group = process_group
+        self._ops = []
+        self.rank = dist.get_rank(self._process_group)
+        self.world_size = dist.get_world_size(self._process_group)
+        self._reqs = []
+
+        self.send_rank = (self.rank + 1) % self.world_size
+        self.recv_rank = (self.rank - 1) % self.world_size
+
+        self.send_rank = dist.get_global_rank(self._process_group, self.send_rank)
+        self.recv_rank = dist.get_global_rank(self._process_group, self.recv_rank)
+
+    def send_recv(
+        self,
+        send_tensor: torch.Tensor,
+        recv_tensor: Optional[torch.Tensor] = None,
+        commit: bool = True,
+    ) -> torch.Tensor:
+        if recv_tensor is None:
+            res = torch.empty_like(send_tensor)
+        else:
+            res = recv_tensor
+
+        # looks like batch_isend_irecv doesn't deadlock even
+        # when we don't swap send recv ops based on rank
+        send_op = dist.P2POp(dist.isend, send_tensor, self.send_rank, group=self._process_group)
+        recv_op = dist.P2POp(dist.irecv, res, self.recv_rank, group=self._process_group)
+        self._ops.extend([send_op, recv_op])
+
+        if commit:
+            self._reqs = dist.batch_isend_irecv(self._ops)
+        return res
+
+    def commit(self):
+        assert len(self._ops) > 0, "No ops to commit"
+        self._reqs = dist.batch_isend_irecv(self._ops)
+
+    def wait(self):
+        assert len(self._reqs) > 0, "No requests to wait for"
+        for req in self._reqs:
+            req.wait()
+        self._reqs = []
+        self._ops = []
+
+
+@torch.jit.script
+def get_half_index(cu_seqlens, *, front: bool):
+    index = torch.zeros(cu_seqlens[-1], dtype=torch.bool, device=cu_seqlens.device)
+    for i in range(len(cu_seqlens) - 1):
+        start, end = cu_seqlens[i], cu_seqlens[i + 1]
+        if front:
+            end = (start + end) // 2
+        else:
+            start = (start + end) // 2
+        index[start:end] = True
+    return index
diff --git a/colossalai/shardformer/modeling/chatglm2.py b/colossalai/shardformer/modeling/chatglm2.py
index 5fd7b6461f7e..a761968af009 100644
--- a/colossalai/shardformer/modeling/chatglm2.py
+++ b/colossalai/shardformer/modeling/chatglm2.py
@@ -216,6 +216,13 @@ def chatglm_model_forward(
                     grad_scale=1 / shard_config.sequence_parallel_size,
                     fp8_communication=shard_config.fp8_communication,
                 )
+            elif shard_config.sequence_parallelism_mode == "all_to_all":
+                hidden_states = split_forward_gather_backward(
+                    hidden_states,
+                    dim=0,
+                    process_group=shard_config.sequence_parallel_process_group,
+                    grad_scale=1 / shard_config.sequence_parallel_size,
+                )
         for idx in range(start_idx, end_idx):
             layer = self.encoder._get_layer(idx)
             if output_hidden_states:
@@ -257,6 +264,13 @@ def chatglm_model_forward(
                     grad_scale=shard_config.sequence_parallel_size,
                     fp8_communication=shard_config.fp8_communication,
                 )
+            elif shard_config.sequence_parallelism_mode == "all_to_all":
+                hidden_states = gather_forward_split_backward(
+                    hidden_states,
+                    dim=0,
+                    process_group=shard_config.sequence_parallel_process_group,
+                    grad_scale=shard_config.sequence_parallel_size,
+                )
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
         if stage_manager.is_last_stage():
@@ -405,6 +419,12 @@ def forward(
             rotary_pos_emb = rotary_pos_emb[None, :seq_length]
         rotary_pos_emb = rotary_pos_emb.transpose(0, 1).contiguous()
 
+        if sp_mode in ["all_to_all"] and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with sp mode `{sp_mode}`. Setting `use_cache=False`..."
+                )
+                use_cache = False
         if sp_mode in ["all_to_all"] and self.training:
             if use_cache:
                 logger.warning_once(
diff --git a/colossalai/shardformer/modeling/command.py b/colossalai/shardformer/modeling/command.py
index 1ece0c118d0c..5303383945bc 100644
--- a/colossalai/shardformer/modeling/command.py
+++ b/colossalai/shardformer/modeling/command.py
@@ -26,6 +26,8 @@
 
 from ..layer import ColoAttention, dist_cross_entropy
 
+_SUPPORTED_SP_MODE = ["all_to_all", "split_gather", "ring", "ring_attn"]
+
 
 class CommandPipelineForwards:
     """
@@ -353,7 +355,7 @@ def command_for_causal_lm_forward(
             return {"hidden_states": hidden_states}
 
 
-def get_command_flash_attention_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
+def get_command_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None):
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -366,7 +368,7 @@ def forward(
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
         if sp_mode is not None:
-            assert sp_mode in ["all_to_all", "split_gather", "ring"], "Invalid sp_mode"
+            assert sp_mode in _SUPPORTED_SP_MODE, f"SP mode {sp_mode} is not supported by {type(self)} yet"
             assert (sp_size is not None) and (
                 sp_group is not None
             ), "Must specify sp_size and sp_group for sequence parallel"
@@ -465,7 +467,7 @@ def forward(
     return forward
 
 
-def get_command_flash_attention_model_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
+def get_command_flash_attention_model_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None):
     logger = logging.get_logger(__name__)
 
     def forward(
diff --git a/colossalai/shardformer/modeling/deepseek.py b/colossalai/shardformer/modeling/deepseek.py
index 48a629705a7c..7ec390d6ab98 100644
--- a/colossalai/shardformer/modeling/deepseek.py
+++ b/colossalai/shardformer/modeling/deepseek.py
@@ -145,7 +145,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         output_split_sizes = torch.zeros_like(input_split_sizes)
 
         # [n0, n1, n2, n3] [m0, m1, m2, m3] -> [n0, n1, m0, m1] [n2, n3, m2, m3]
-        dist.all_to_all_single(output_split_sizes, input_split_sizes, group=self.ep_group)
+        dist.all_to_all_single(
+            output_split_sizes,
+            input_split_sizes,
+            group=self.ep_group,
+        )
 
         with torch.no_grad():
             activate_experts = output_split_sizes[: self.num_experts_per_ep].clone()
@@ -695,6 +699,10 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
+        # TODO: upgrade transformers to 4.44.0 to fix the bug, remove the hard code.
+        self._use_flash_attention_2 = shard_config.enable_flash_attention
+        self._use_sdpa = False if shard_config.enable_flash_attention else self._use_sdpa
+
         if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 99b31745bb05..4e8f67407bc6 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -1,8 +1,9 @@
 import math
 import warnings
-from typing import List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import torch
+import torch.distributed
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
@@ -24,14 +25,14 @@
 from transformers.utils import logging
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer.layer._operation import (
-    all_to_all_comm,
-    gather_forward_split_backward,
-    split_forward_gather_backward,
-)
+from colossalai.shardformer.layer import AttnMaskType
+from colossalai.shardformer.layer._operation import all_to_all_comm, gather_sp_output, split_forward_gather_backward
+from colossalai.shardformer.layer.utils import is_share_sp_tp, split_batch_zigzag
 from colossalai.shardformer.shard import ShardConfig
 
-from ..layer import ColoAttention, dist_cross_entropy
+from ..layer import ColoAttention, RingAttention, dist_cross_entropy
+
+_SUPPORTED_SP_MODE = ["all_to_all", "split_gather", "ring", "ring_attn"]
 
 
 class LlamaPipelineForwards:
@@ -57,6 +58,10 @@ def llama_model_forward(
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
         shard_config: ShardConfig = None,
+        # Split output only when computing cross entropy using llama_for_causal_lm_forward
+        # or get_lm_forward_with_dist_cross_entropy
+        # Default to True to avoid bug when calling classification forward from huggingface
+        force_sp_output_gather: bool = True,
     ):
         logger = logging.get_logger(__name__)
 
@@ -97,7 +102,7 @@ def llama_model_forward(
         sp_group = shard_config.sequence_parallel_process_group
         sp_size = shard_config.sequence_parallel_size
         if sp_mode == "all_to_all" and not stage_manager.is_first_stage():
-            # For correct positions ids. The states will be gather along the seq dim in the attention layer later.
+            # For generating full positions ids, as the states will be gather along the seq dim in the attention layer later.
             seq_length *= sp_size
 
         past_seen_tokens = 0
@@ -127,22 +132,36 @@ def llama_model_forward(
             position_ids = cache_position.unsqueeze(0)
         # embed positions, for the first stage, hidden_states is the input embeddings,
         # for the other stages, hidden_states is the output of the previous stage
-        if shard_config.enable_flash_attention:
+        if not stage_manager.is_first_stage() and sp_mode == "ring_attn":
+            _, attn_kwargs, _ = RingAttention.prepare_varlen_batch(attention_mask, sp_group)
+        elif shard_config.enable_flash_attention:
             # in this case, attention_mask is a dict rather than a tensor
             mask_shape = (batch_size, 1, seq_length_with_past, seq_length_with_past)
-            attention_mask = ColoAttention.prepare_attn_kwargs(
+            attn_kwargs = ColoAttention.prepare_attn_kwargs(
                 mask_shape,
                 hidden_states.dtype,
                 hidden_states.device,
                 q_padding_mask=attention_mask,
                 is_causal=True,
+                invert=(sp_mode != "ring_attn"),
             )
         else:
-            attention_mask = self._update_causal_mask(attention_mask, hidden_states, cache_position)
+            attn_kwargs = self._update_causal_mask(attention_mask, hidden_states, cache_position)
 
         # Support SP + PP
+        # TODO: support padded casual cu_seqlens across stages
         if stage_manager.is_first_stage():
-            if sp_mode in ["ring", "split_gather"]:
+            # Ring Attention zigzag batch processing
+            if sp_mode == "ring_attn":
+                assert shard_config.enable_flash_attention, "Ring Attention inherently requires Flash Attention."
+                if attn_kwargs["attention_mask_type"] == AttnMaskType.PADDED_CAUSAL:
+                    hidden_states, attn_kwargs, position_ids = RingAttention.prepare_varlen_batch(
+                        attention_mask, sp_group, hidden_states, position_ids
+                    )
+                else:
+                    hidden_states, position_ids = split_batch_zigzag([hidden_states, position_ids], sp_group)
+
+            elif is_share_sp_tp(sp_mode):
                 hidden_states = split_forward_gather_backward(
                     hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
                 )
@@ -181,12 +200,11 @@ def llama_model_forward(
         for idx, decoder_layer in enumerate(self.layers[start_idx:end_idx], start=start_idx):
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
-
             if idx - start_idx < num_ckpt_layers:
                 layer_outputs = self._gradient_checkpointing_func(
                     decoder_layer.__call__,
                     hidden_states,
-                    attention_mask,
+                    attn_kwargs,
                     position_ids,
                     past_key_values,
                     output_attentions,
@@ -196,14 +214,13 @@ def llama_model_forward(
             else:
                 layer_outputs = decoder_layer(
                     hidden_states,
-                    attention_mask=attention_mask,
+                    attention_mask=attn_kwargs,
                     position_ids=position_ids,
                     past_key_value=past_key_values,
                     output_attentions=output_attentions,
                     use_cache=use_cache,
                     cache_position=cache_position,
                 )
-
             hidden_states = layer_outputs[0]
 
             if use_cache:
@@ -213,13 +230,9 @@ def llama_model_forward(
 
         if stage_manager.is_last_stage():
             hidden_states = self.norm(hidden_states)
-            if sp_mode == "ring" or sp_mode == "split_gather":
-                hidden_states = gather_forward_split_backward(
-                    hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
-                )
-            elif sp_mode == "all_to_all":
-                hidden_states = gather_forward_split_backward(
-                    hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
+            if (not shard_config.parallel_output) or force_sp_output_gather or is_share_sp_tp(sp_mode):
+                hidden_states = gather_sp_output(
+                    hidden_states, sp_group, sp_mode, fp8_communication=shard_config.fp8_communication
                 )
 
         # add hidden states from the last decoder layer
@@ -306,6 +319,15 @@ def llama_for_causal_lm_forward(
             logger.warning_once("output_hidden_states=True is not supported for pipeline models at the moment.")
             output_hidden_states = False
 
+        if shard_config.sequence_parallelism_mode == "ring_attn" and shard_config.parallel_output:
+            # Split labels in a zigzag fashion too
+            sp_group = shard_config.sequence_parallel_process_group
+            if attention_mask.bool().all():
+                labels = split_batch_zigzag(labels, sp_group, seq_dim=1)
+            else:
+                # [B, max_seqlen // sp_size]
+                labels, _, _ = RingAttention.prepare_varlen_batch(attention_mask, sp_group, labels, is_label=True)
+
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = LlamaPipelineForwards.llama_model_forward(
             self.model,
@@ -323,6 +345,7 @@ def llama_for_causal_lm_forward(
             hidden_states=hidden_states,
             stage_index=stage_index,
             shard_config=shard_config,
+            force_sp_output_gather=False,
         )
         past_key_values = None
 
@@ -469,7 +492,7 @@ def get_llama_flash_attention_forward(shard_config: ShardConfig, sp_mode=None, s
     def forward(
         self,
         hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
+        attention_mask: Optional[Union[torch.Tensor, Dict]] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_value: Optional[Cache] = None,
         output_attentions: bool = False,
@@ -478,7 +501,7 @@ def forward(
         **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
         if sp_mode is not None:
-            assert sp_mode in ["all_to_all", "split_gather", "ring"], "Invalid sp_mode"
+            assert sp_mode in _SUPPORTED_SP_MODE, f"SP mode {sp_mode} is not supported by {type(self)} yet"
             assert (sp_size is not None) and (
                 sp_group is not None
             ), "Must specify sp_size and sp_group for sequence parallel"
@@ -489,7 +512,7 @@ def forward(
 
         bsz, q_len, _ = hidden_states.size()
         # sp: modify sp_len when sequence parallel mode is ring
-        if sp_mode in ["split_gather", "ring"]:
+        if is_share_sp_tp(sp_mode):
             q_len *= sp_size
 
         if self.config.pretraining_tp > 1:
@@ -534,6 +557,7 @@ def forward(
                 )
 
             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
         cos, sin = self.rotary_emb(value_states, position_ids)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
 
@@ -545,12 +569,21 @@ def forward(
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
-        if shard_config.enable_flash_attention:
+        if sp_mode == "ring_attn":
+            attn_output = RingAttention.attention(
+                query_states,
+                key_states,
+                value_states,
+                sp_group,
+                **attention_mask,
+                inner_ring_size=shard_config.inner_ring_size,
+            )
+
+        elif shard_config.enable_flash_attention:
             assert isinstance(attention_mask, dict), "Flash Attention Error: attention_mask should be a dict."
             attn_output = ColoAttention.attention(query_states, key_states, value_states, **attention_mask)
         else:
             attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
-
             if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
                 raise ValueError(
                     f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
@@ -613,6 +646,10 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        # Split output only when computing cross entropy using llama_for_causal_lm_forward
+        # or get_lm_forward_with_dist_cross_entropy
+        # Default to True to avoid bug when calling classification forward from huggingface
+        force_sp_output_gather: bool = True,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -639,32 +676,45 @@ def forward(
 
         past_seen_tokens = 0
         seq_len = inputs_embeds.shape[1]
+        batch_size = inputs_embeds.shape[0]
         if use_cache:  # kept for BC (cache positions)
             if not isinstance(past_key_values, StaticCache):
                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                 past_seen_tokens = past_key_values.get_seq_length()
+
         if cache_position is None:
             if isinstance(past_key_values, StaticCache):
                 raise ValueError("cache_position is a required argument when using StaticCache.")
             cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_len, device=inputs_embeds.device)
-
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
-        # in this case, attention_mask is a dict rather than a tensor
         if shard_config.enable_flash_attention:
-            mask_shape = (inputs_embeds.shape[0], 1, seq_len, past_seen_tokens + seq_len)
-            attention_mask = ColoAttention.prepare_attn_kwargs(
+            mask_shape = (batch_size, 1, seq_len, past_seen_tokens + seq_len)
+            attn_kwargs: dict = ColoAttention.prepare_attn_kwargs(
                 mask_shape,
                 inputs_embeds.dtype,
                 inputs_embeds.device,
                 q_padding_mask=attention_mask,
                 is_causal=True,
+                invert=(sp_mode != "ring_attn"),
             )
+
         else:
-            attention_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
+            attn_kwargs: torch.Tensor = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
+
+        # Ring Attention zigzag batch processing
+        if sp_mode == "ring_attn":
+            assert shard_config.enable_flash_attention, "Ring Attention inherently requires Flash Attention."
+            if attn_kwargs["attention_mask_type"] == AttnMaskType.PADDED_CAUSAL:
+                inputs_embeds, attn_kwargs, position_ids = RingAttention.prepare_varlen_batch(
+                    attention_mask, sp_group, inputs_embeds, position_ids
+                )
+            else:
+                inputs_embeds, position_ids = split_batch_zigzag([inputs_embeds, position_ids], sp_group)
+                attn_kwargs = {"attention_mask_type": attn_kwargs["attention_mask_type"]}  # drop redundant tensors
 
-        if sp_mode in ["ring", "split_gather"]:
+        elif is_share_sp_tp(sp_mode):
             inputs_embeds = split_forward_gather_backward(
                 inputs_embeds, 1, sp_group, fp8_communication=shard_config.fp8_communication
             )
@@ -686,7 +736,7 @@ def forward(
                 layer_outputs = self._gradient_checkpointing_func(
                     decoder_layer.__call__,
                     hidden_states,
-                    attention_mask,
+                    attn_kwargs,
                     position_ids,
                     past_key_values,
                     output_attentions,
@@ -697,7 +747,7 @@ def forward(
             else:
                 layer_outputs = decoder_layer(
                     hidden_states,
-                    attention_mask=attention_mask,
+                    attention_mask=attn_kwargs,
                     position_ids=position_ids,
                     past_key_value=past_key_values,
                     output_attentions=output_attentions,
@@ -714,14 +764,10 @@ def forward(
                 all_self_attns += (layer_outputs[1],)
 
         hidden_states = self.norm(hidden_states)
-
-        if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(
-                hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
-            )
-        elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(
-                hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
+        # Cases that don't support parallelizing cross entropy computation along sequence
+        if (not shard_config.parallel_output) or is_share_sp_tp(sp_mode) or force_sp_output_gather:
+            hidden_states = gather_sp_output(
+                hidden_states, sp_group, sp_mode, fp8_communication=shard_config.fp8_communication
             )
 
         # add hidden states from the last decoder layer
@@ -795,6 +841,15 @@ def forward(
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        if shard_config.sequence_parallelism_mode == "ring_attn" and shard_config.parallel_output:
+            # Special processing: Split labels in a zigzag fashion too
+            sp_group = shard_config.sequence_parallel_process_group
+            if attention_mask.bool().all():
+                labels = split_batch_zigzag(labels, sp_group, seq_dim=1, is_label=True)
+            else:
+                # [B, max_seq_len // sp_size]
+                labels, _, _ = RingAttention.prepare_varlen_batch(attention_mask, sp_group, labels, is_label=True)
+
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
         outputs = self.model(
             input_ids=input_ids,
@@ -807,6 +862,7 @@ def forward(
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
             cache_position=cache_position,
+            force_sp_output_gather=False,
         )
 
         hidden_states = outputs[0]
@@ -817,7 +873,6 @@ def forward(
         else:
             logits = self.lm_head(hidden_states)
         logits = logits.float()
-
         loss = dist_cross_entropy(
             labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
         )
diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index 76b441fe4258..4850ef1b6929 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -696,7 +696,9 @@ def forward(
         # sp: all-to-all comminucation when introducing sequence parallel
         if sp_mode == "all_to_all":
             attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()  # (1, 8, 128)
-            attn_output = all_to_all_comm(attn_output, sp_group, scatter_dim=1, gather_dim=2)  # (1, 4, 256)
+            attn_output = all_to_all_comm(
+                attn_output, sp_group, scatter_dim=1, gather_dim=2, fp8_communication=shard_config.fp8_communication
+            )  # (1, 4, 256)
         else:
             attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
diff --git a/colossalai/shardformer/policies/base_policy.py b/colossalai/shardformer/policies/base_policy.py
index 282cf0464794..7c1e6f0d762d 100644
--- a/colossalai/shardformer/policies/base_policy.py
+++ b/colossalai/shardformer/policies/base_policy.py
@@ -75,6 +75,7 @@ class Policy(ABC):
     def __init__(self) -> None:
         self.shard_config: Optional[ShardConfig] = None
         self.model: Optional[Module] = None
+        self.is_causal = None  # Whether we're doing causal lm, i.e. using cross entropy
 
     def set_model(self, model: nn.Module) -> None:
         r"""
diff --git a/colossalai/shardformer/policies/command.py b/colossalai/shardformer/policies/command.py
index d0903b11c9f3..323480d6d084 100644
--- a/colossalai/shardformer/policies/command.py
+++ b/colossalai/shardformer/policies/command.py
@@ -69,13 +69,18 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         sp_size = self.shard_config.sequence_parallel_size or None
         sp_group = self.shard_config.sequence_parallel_process_group or None
         sp_partial_derived = sp_mode in ["split_gather", "ring"]
+        if sp_mode == "ring_attn" and not self.is_causal:
+            raise ValueError("Ring attention is only meant for causal language modeling.")
 
+        tp_size = self.shard_config.tensor_parallel_size or None
+        num_q_heads = self.model.config.num_attention_heads
+        num_kv_heads = getattr(self.model.config, "num_key_value_heads", None)
         if sp_mode == "all_to_all":
-            decoder_attribute_replacement = {
-                "num_heads": self.model.config.num_attention_heads // sp_size,
-            }
-            if getattr(self.model.config, "num_key_value_heads", False):
-                decoder_attribute_replacement["num_key_value_heads"] = self.model.config.num_key_value_heads // sp_size
+            num_q_heads //= sp_size
+            decoder_attribute_replacement = {"num_heads": num_q_heads}
+            if num_kv_heads:
+                num_kv_heads //= sp_size
+                decoder_attribute_replacement["num_key_value_heads"] = num_kv_heads
 
             policy[attn_cls] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
@@ -104,21 +109,18 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
 
         if self.shard_config.enable_tensor_parallelism:
             assert (
-                self.model.config.num_attention_heads % self.shard_config.tensor_parallel_size == 0
+                num_q_heads % tp_size == 0
             ), f"The number of attention heads must be divisible by tensor parallel size."
             if hasattr(self.model.config, "num_key_value_heads"):
                 assert (
-                    self.model.config.num_key_value_heads >= self.shard_config.tensor_parallel_size
-                    and self.model.config.num_key_value_heads % self.shard_config.tensor_parallel_size == 0
+                    num_kv_heads >= tp_size and num_kv_heads % tp_size == 0
                 ), f"The number of key_value heads must be divisible by, and must not be less than tensor parallel size."
             decoder_attribute_replacement = {
-                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
-                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "self_attn.hidden_size": self.model.config.hidden_size // tp_size,
+                "self_attn.num_heads": num_q_heads // tp_size,
             }
             if getattr(self.model.config, "num_key_value_heads", False):
-                decoder_attribute_replacement["self_attn.num_key_value_heads"] = (
-                    self.model.config.num_key_value_heads // self.shard_config.tensor_parallel_size
-                )
+                decoder_attribute_replacement["self_attn.num_key_value_heads"] = num_kv_heads // tp_size
 
             policy[CohereDecoderLayer] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
@@ -297,10 +299,11 @@ class CommandForCausalLMPolicy(CommandPolicy):
     def module_policy(self):
         from transformers import CohereForCausalLM
 
+        self.is_causal = True
         policy = super().module_policy()
 
         if self.shard_config.enable_tensor_parallelism:
-            # add a new item for casual lm
+            # add a new item for causal lm
             new_item = {
                 CohereForCausalLM: ModulePolicyDescription(
                     sub_module_replacement=[
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 23d2d39134d3..429eec52f7bc 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -69,13 +69,20 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         sp_size = self.shard_config.sequence_parallel_size or None
         sp_group = self.shard_config.sequence_parallel_process_group or None
         sp_partial_derived = sp_mode in ["split_gather", "ring"]
+        if sp_mode == "ring_attn" and not self.is_causal:
+            raise ValueError("Ring attention is only meant for causal language modeling.")
+
+        tp_size = self.shard_config.tensor_parallel_size
+        # Modified by SP and TP
+        num_q_heads = self.model.config.num_attention_heads
+        num_kv_heads = getattr(self.model.config, "num_key_value_heads", None)
 
         if sp_mode == "all_to_all":
-            decoder_attribute_replacement = {
-                "num_heads": self.model.config.num_attention_heads // sp_size,
-            }
-            if getattr(self.model.config, "num_key_value_heads", False):
-                decoder_attribute_replacement["num_key_value_heads"] = self.model.config.num_key_value_heads // sp_size
+            num_q_heads //= sp_size
+            decoder_attribute_replacement = {"num_heads": num_q_heads}
+            if num_kv_heads:
+                num_kv_heads //= sp_size
+                decoder_attribute_replacement["num_key_value_heads"] = num_kv_heads
 
             policy[attn_cls] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
@@ -104,21 +111,20 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
 
         if self.shard_config.enable_tensor_parallelism:
             assert (
-                self.model.config.num_attention_heads % self.shard_config.tensor_parallel_size == 0
+                num_q_heads % tp_size == 0
             ), f"The number of attention heads must be divisible by tensor parallel size."
             if hasattr(self.model.config, "num_key_value_heads"):
                 assert (
-                    self.model.config.num_key_value_heads >= self.shard_config.tensor_parallel_size
-                    and self.model.config.num_key_value_heads % self.shard_config.tensor_parallel_size == 0
+                    num_kv_heads >= tp_size and num_kv_heads % tp_size == 0
                 ), f"The number of key_value heads must be divisible by, and must not be less than tensor parallel size."
+            num_q_heads //= tp_size
             decoder_attribute_replacement = {
-                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
-                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "self_attn.hidden_size": self.model.config.hidden_size // tp_size,
+                "self_attn.num_heads": num_q_heads,
             }
             if getattr(self.model.config, "num_key_value_heads", False):
-                decoder_attribute_replacement["self_attn.num_key_value_heads"] = (
-                    self.model.config.num_key_value_heads // self.shard_config.tensor_parallel_size
-                )
+                num_kv_heads //= tp_size
+                decoder_attribute_replacement["self_attn.num_key_value_heads"] = num_kv_heads
 
             policy[LlamaDecoderLayer] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
@@ -302,10 +308,11 @@ class LlamaForCausalLMPolicy(LlamaPolicy):
     def module_policy(self):
         from transformers import LlamaForCausalLM
 
+        self.is_causal = True
         policy = super().module_policy()
 
         if self.shard_config.enable_tensor_parallelism:
-            # add a new item for casual lm
+            # add a new item for causal lm
             new_item = {
                 LlamaForCausalLM: ModulePolicyDescription(
                     sub_module_replacement=[
@@ -321,10 +328,6 @@ def module_policy(self):
                     ],
                 )
             }
-            if self.shard_config.parallel_output:
-                new_item[LlamaForCausalLM].method_replacement = {
-                    "forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)
-                }
         else:
             new_item = {
                 LlamaForCausalLM: ModulePolicyDescription(
@@ -344,7 +347,11 @@ def module_policy(self):
             self.set_pipeline_forward(
                 model_cls=LlamaForCausalLM, new_forward=LlamaPipelineForwards.llama_for_causal_lm_forward, policy=policy
             )
-
+        elif self.shard_config.enable_tensor_parallelism or self.shard_config.enable_sequence_parallelism:
+            # Compute loss distributedly along the sequence dimension
+            new_item[LlamaForCausalLM].method_replacement = {
+                "forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)
+            }
         return policy
 
     def get_held_layers(self) -> List[Module]:
@@ -384,7 +391,12 @@ def module_policy(self):
                 LlamaForSequenceClassification: ModulePolicyDescription(
                     sub_module_replacement=[
                         SubModuleReplacementDescription(
-                            suffix="score", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                            suffix="score",
+                            target_module=Linear1D_Col,
+                            kwargs=dict(
+                                gather_output=True,
+                                fp8_communication=self.shard_config.fp8_communication,
+                            ),
                         )
                     ]
                 )
diff --git a/colossalai/shardformer/policies/mistral.py b/colossalai/shardformer/policies/mistral.py
index 72a5158e5269..4d16038c11b7 100644
--- a/colossalai/shardformer/policies/mistral.py
+++ b/colossalai/shardformer/policies/mistral.py
@@ -299,7 +299,7 @@ def module_policy(self):
         policy = super().module_policy()
 
         if self.shard_config.enable_tensor_parallelism:
-            # add a new item for casual lm
+            # add a new item for causal lm
             new_item = {
                 MistralForCausalLM: ModulePolicyDescription(
                     sub_module_replacement=[
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index 550b7f2360f5..3a373889c5c6 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -144,10 +144,14 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 description=SubModuleReplacementDescription(
                     suffix="embed_tokens",
                     target_module=embedding_cls,
-                    kwargs={
-                        "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
-                        "fp8_communication": self.shard_config.fp8_communication,
-                    },
+                    kwargs=(
+                        {
+                            "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                        }
+                        if self.shard_config.enable_tensor_parallelism
+                        else {"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by}
+                    ),
                 ),
                 policy=policy,
                 target_key=MixtralModel,
@@ -164,7 +168,6 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                             "ep_group": self.shard_config.ep_group,
                             "tp_group": self.shard_config.tensor_parallel_process_group,
                             "moe_dp_group": self.shard_config.moe_dp_group,
-                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     )
                 ],
@@ -285,7 +288,7 @@ def module_policy(self):
         policy = super().module_policy()
         # TODO: assign pg mesh from plugin to all modules
         if self.shard_config.enable_tensor_parallelism:
-            # add a new item for casual lm
+            # add a new item for causal lm
             new_item = {
                 MixtralForCausalLM: ModulePolicyDescription(
                     sub_module_replacement=[
diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py
index 8e33d786b6f9..1219119bb095 100644
--- a/colossalai/shardformer/shard/shard_config.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -10,7 +10,7 @@
 from .grad_ckpt_config import GradientCheckpointConfig
 
 __all__ = ["ShardConfig"]
-SUPPORT_SP_MODE = ["split_gather", "ring", "all_to_all"]
+SUPPORT_SP_MODE = ["split_gather", "ring", "all_to_all", "ring_attn"]
 
 
 @dataclass
@@ -30,6 +30,8 @@ class ShardConfig:
         gradient_checkpoint_config (Optional[GradientCheckpointConfig]): The gradient checkpoint config. Defaults to None.
         enable_all_optimization (bool): Whether to turn on all optimization tools including 'fused normalization', 'flash attention', 'JIT fused operators', 'sequence parallelism' and 'sequence overlap'. Defaults to False.
         fp8_communication (bool, optional): Whether to enable fp8 communication in model parallelism. Defaults to False.
+        parallel_output (bool): For TP: whether to use parallelize cross entropy computation along the feature dim.
+            For SP: set to True to NOT gather the output along the seq dim.
     """
 
     tensor_parallel_process_group: Optional[ProcessGroup] = None
@@ -48,6 +50,8 @@ class ShardConfig:
     gradient_checkpoint_config: Optional[GradientCheckpointConfig] = None
     extra_kwargs: Dict[str, Any] = field(default_factory=dict)
 
+    # For ring attention
+    inner_ring_size: Optional[int] = None
     # for moe related
     moe_dp_group: Optional[ProcessGroup] = None
     ep_group: Optional[ProcessGroup] = None
@@ -81,9 +85,9 @@ def __post_init__(self):
                     self.enable_tensor_parallelism
                 ), f"sequence parallelism mode {self.sequence_parallelism_mode} can only be used when enable_tensor_parallelism is True"
             elif self.sequence_parallelism_mode in ["all_to_all"]:
-                assert (
-                    not self.enable_tensor_parallelism
-                ), f"sequence parallelism mode {self.sequence_parallelism_mode} can only be used when enable_tensor_parallelism is False"
+                # assert (
+                #     not self.enable_tensor_parallelism
+                # ), f"sequence parallelism mode {self.sequence_parallelism_mode} can only be used when enable_tensor_parallelism is False"
                 if self.enable_sequence_overlap:
                     self.enable_sequence_overlap = False
                     warnings.warn(
diff --git a/colossalai/testing/utils.py b/colossalai/testing/utils.py
index 5f6864ff0059..90d35dc851bd 100644
--- a/colossalai/testing/utils.py
+++ b/colossalai/testing/utils.py
@@ -176,7 +176,7 @@ def test_something():
     else:
         exception = Exception
 
-    func_wrapper = rerun_on_exception(exception_type=exception, pattern=".*Address already in use.*")
+    func_wrapper = rerun_on_exception(exception_type=exception, pattern=".*(A|a)ddress already in use.*")
     return func_wrapper
 
 
diff --git a/colossalai/zero/gemini/gemini_optimizer.py b/colossalai/zero/gemini/gemini_optimizer.py
index 1d755c417b48..fdf2a497626f 100644
--- a/colossalai/zero/gemini/gemini_optimizer.py
+++ b/colossalai/zero/gemini/gemini_optimizer.py
@@ -1,7 +1,6 @@
 # this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import copy
 import math
-import warnings
 from typing import Any, Dict, Iterator, OrderedDict, Set, Tuple, Union
 
 import torch
@@ -136,7 +135,7 @@ def __init__(
         self.tp_rank = dist.get_rank(tp_group) if tp_group is not None else 0
         self.verbose = verbose
         self.param_groups_backup = list()
-
+        self.logger = get_dist_logger()
         # Mapping from integer id to real/fake param tensor, used for checkpointing.
         self.id_to_real_params: Dict[int, Parameter] = dict()
         self.id_to_fake_params: Dict[int, Parameter] = dict()
@@ -148,9 +147,10 @@ def __init__(
         for name, param in module.named_parameters():
             if is_ddp_ignored(param):
                 if param.requires_grad:
-                    warnings.warn(
+                    self.logger.warning(
                         f"Parameter `{name}` is ignored by DDP but requires gradient! "
-                        "You should handle its optimizer update by yourself!"
+                        "You should handle its optimizer update by yourself!",
+                        ranks=[0],
                     )
             else:
                 ddp_param_list.append(param)
@@ -842,7 +842,9 @@ def clip_grad_by_norm(
         *args,
         **kwargs,
     ) -> torch.Tensor:
-        warnings.warn(f"Gemini controls grad clipping by itself, so you should not use clip_grad_by_norm")
+        self.logger.warning(
+            f"Gemini controls grad clipping by itself, so you should not use clip_grad_by_norm", ranks=[0]
+        )
 
 
 class GeminiAdamOptimizer(GeminiOptimizer):
diff --git a/docs/source/en/basics/launch_colossalai.md b/docs/source/en/basics/launch_colossalai.md
index 8a6028d6c49a..32748dae1163 100644
--- a/docs/source/en/basics/launch_colossalai.md
+++ b/docs/source/en/basics/launch_colossalai.md
@@ -131,17 +131,18 @@ with one simple command. There are two ways you can launch multi-node jobs.
 
 This is suitable when you only have a few nodes. Let's say I have two nodes, namely `host1` and `host2`,  I can start
 multi-node training with the following command. Compared to single-node training, you must specify the `master_addr`
-option, which is auto-set to localhost if running on a single node only.
+option, which is auto-set to localhost if running on a single node only. \
+Additionally, you must also ensure that all nodes share the same open ssh port, which can be specified using --ssh-port.
 
 :::caution
 
-`master_addr` cannot be localhost when running on multiple nodes, it should be the hostname or IP address of a node.
+`master_addr` cannot be localhost when running on multiple nodes, it should be the **hostname or IP address** of a node.
 
 :::
 
 ```shell
 # run on these two nodes
-colossalai run --nproc_per_node 4 --host host1,host2 --master_addr host1 test.py
+colossalai run --nproc_per_node 4 --host host1,host2 --master_addr host1 test.py --ssh-port 22
 ```
 - Run with `--hostfile`
 
diff --git a/docs/source/zh-Hans/basics/launch_colossalai.md b/docs/source/zh-Hans/basics/launch_colossalai.md
index a80d16717e40..9e40f64c2124 100644
--- a/docs/source/zh-Hans/basics/launch_colossalai.md
+++ b/docs/source/zh-Hans/basics/launch_colossalai.md
@@ -116,17 +116,17 @@ colossalai run --nproc_per_node 4 --master_port 29505 test.py
 - 通过`--hosts`来启动
 
 这个方式适合节点数不多的情况。假设我们有两个节点，分别为`host`和`host2`。我们可以用以下命令进行多节点训练。
-比起单节点训练，多节点训练需要手动设置`--master_addr` （在单节点训练中`master_addr`默认为`127.0.0.1`）。
+比起单节点训练，多节点训练需要手动设置`--master_addr` （在单节点训练中`master_addr`默认为`127.0.0.1`）。同时，你需要确保每个节点都使用同一个ssh port。可以通过--ssh-port设置。
 
 :::caution
 
-多节点训练时，`master_addr`不能为`localhost`或者`127.0.0.1`，它应该是一个节点的名字或者IP地址。
+多节点训练时，`master_addr`不能为`localhost`或者`127.0.0.1`，它应该是一个节点的**名字或者IP地址**。
 
 :::
 
 ```shell
 # 在两个节点上训练
-colossalai run --nproc_per_node 4 --host host1,host2 --master_addr host1 test.py
+colossalai run --nproc_per_node 4 --host host1,host2 --master_addr host1 test.py --ssh-port 22
 ```
 
 
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 21d081145cd9..bb14378ad571 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -28,6 +28,7 @@
 # Constants
 # ==============================
 
+# We have lots of llamas for your choice!
 MODEL_CONFIGS = {
     "100m": LlamaConfig(
         max_position_embeddings=4096,
@@ -36,6 +37,7 @@
         intermediate_size=2048,
         hidden_size=1024,
     ),
+    "5b": LlamaConfig(max_position_embeddings=4096, num_key_value_heads=8),
     "7b": LlamaConfig(max_position_embeddings=4096),
     "13b": LlamaConfig(
         hidden_size=5120,
@@ -68,9 +70,6 @@ def main():
         default="gemini",
         help="Choose which plugin to use",
     )
-    parser.add_argument(
-        "--overlap", action="store_true", help="Overlap communication with computation in Pipeline Parallel."
-    )
     parser.add_argument("-b", "--batch_size", type=int, default=2, help="Batch size")
     parser.add_argument("-s", "--num_steps", type=int, default=5, help="Number of steps to run")
     parser.add_argument("-i", "--ignore_steps", type=int, default=2, help="Number of steps to ignore")
@@ -94,13 +93,26 @@ def main():
 
     parser.add_argument("--pp_style", default="1f1b", choices=["1f1b", "interleaved"])
     parser.add_argument("--n_chunks", default=1, help="number of model chunks", type=eval)
-    parser.add_argument("--profile", action="store_true", help="Profile the code", default=False)
+    parser.add_argument("--profile", action="store_true", help="Profile the code")
+    parser.add_argument(
+        "--nsys",
+        action="store_true",
+        help="Use nsys for profiling. \
+        You should put something like this before colossalai launch: \
+        nsys profile -w true -t cuda,cudnn,cublas -s cpu --capture-range=cudaProfilerApi --capture-range-end=stop --cudabacktrace=true -x true --python-backtrace=cuda -o prof_out",
+    )
     parser.add_argument("--disable-async-reduce", action="store_true", help="Disable the asynchronous reduce operation")
     parser.add_argument("--prefetch_num", type=int, default=0, help="chunk prefetch max number")
     parser.add_argument("--no_cache", action="store_true")
     parser.add_argument("--use_fp8_comm", action="store_true", default=False, help="for using fp8 during communication")
-    parser.add_argument("--overlap_allgather", action="store_true")
     parser.add_argument("--use_fp8", action="store_true")
+    parser.add_argument("--overlap_allgather", action="store_true")
+    parser.add_argument(
+        "--sp_mode",
+        default="all_to_all",
+        choices=["all_to_all", "ring_attn", "ring", "split_gather"],
+        help="Sequence parallelism mode",
+    )
     args = parser.parse_args()
 
     colossalai.launch_from_torch()
@@ -203,13 +215,12 @@ def empty_init():
             num_model_chunks=args.n_chunks,
             zero_stage=args.zero,
             sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
             enable_sequence_parallelism=args.sp > 1,
             enable_fused_normalization=torch.cuda.is_available(),
             enable_flash_attention=args.xformers,
             microbatch_size=args.mbs,
             precision="bf16",
-            dp_outside=False,
-            overlap_p2p=args.overlap,
             enable_metadata_cache=not args.no_cache,
             overlap_allgather=args.overlap_allgather,
             use_fp8=args.use_fp8,
@@ -303,8 +314,9 @@ def empty_init():
     with get_profile_context(
         args.profile,
         args.ignore_steps,
-        len(dataloader) - 1,
+        1,  # avoid creating massive log files
         save_dir=f"profile/{time.strftime('%H:%M', time.localtime())}-{args.plugin}-llama-{args.config}",
+        nsys=args.nsys,
     ) as prof:
         if isinstance(plugin, HybridParallelPlugin) and args.pp > 1:
             data_iter = iter(dataloader)
@@ -330,13 +342,16 @@ def empty_init():
                 performance_evaluator.on_step_start(step)
                 outputs = model(**batch)
                 loss = outputs[0]
+                del outputs  # free memory
+
+                if dist.get_rank() == dist.get_world_size() - 1:
+                    print(f"Step {step} loss: {loss}")
                 booster.backward(loss, optimizer)
                 optimizer.step()
                 optimizer.zero_grad()
 
                 performance_evaluator.on_step_end(**batch)
                 prof.step()
-
     performance_evaluator.on_fit_end()
     coordinator.print_on_master(f"Max CUDA memory usage: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB")
 
diff --git a/examples/language/opt/README.md b/examples/language/opt/README.md
index af1e794374ed..694c5cf91acc 100644
--- a/examples/language/opt/README.md
+++ b/examples/language/opt/README.md
@@ -17,7 +17,7 @@ limitations under the License.
 ## OPT
 Meta recently released [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), a 175-Billion parameter AI language model, which stimulates AI programmers to perform various downstream tasks and application deployments.
 
-The following example of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) demonstrates fine-tuning Casual Language Modelling at low cost.
+The following example of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) demonstrates fine-tuning Causal Language Modelling at low cost.
 
 
 ## Our Modifications
diff --git a/examples/language/performance_evaluator.py b/examples/language/performance_evaluator.py
index ca4a02cd2981..f5ad1d23d2a7 100644
--- a/examples/language/performance_evaluator.py
+++ b/examples/language/performance_evaluator.py
@@ -28,7 +28,7 @@ def all_reduce_mean(x: float, world_size: int) -> float:
     return tensor.item()
 
 
-def get_profile_context(enable_flag, warmup_steps, active_steps, save_dir):
+def get_profile_context(enable_flag, warmup_steps, active_steps, save_dir, nsys=False):
     class DummyProfiler:
         def __init__(self):
             self.step_number = 0
@@ -42,7 +42,29 @@ def __enter__(self):
         def __exit__(self, exc_type, exc_value, traceback):
             pass
 
+    class NsysProfiler:
+        def __init__(self, warmup_steps, active_steps):
+            self.step_number = 0
+            self.warmup_steps = warmup_steps
+            self.active_steps = active_steps
+
+        def step(self):
+            if self.step_number == self.warmup_steps:
+                torch.cuda.cudart().cudaProfilerStart()
+            elif self.step_number == self.warmup_steps + self.active_steps:
+                torch.cuda.cudart().cudaProfilerStop()
+            self.step_number += 1
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc_value, traceback):
+            pass
+
     if enable_flag:
+        if nsys:
+            return NsysProfiler(warmup_steps, active_steps)
+
         return profile(
             activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             schedule=schedule(wait=0, warmup=warmup_steps, active=active_steps),
diff --git a/examples/tutorial/opt/opt/README.md b/examples/tutorial/opt/opt/README.md
index a01209cbda0e..3776e0c64552 100644
--- a/examples/tutorial/opt/opt/README.md
+++ b/examples/tutorial/opt/opt/README.md
@@ -19,7 +19,7 @@ limitations under the License.
 ## OPT
 Meta recently released [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), a 175-Billion parameter AI language model, which stimulates AI programmers to perform various downstream tasks and application deployments.
 
-The following example of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) demonstrates fine-tuning Casual Language Modelling at low cost.
+The following example of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) demonstrates fine-tuning causal Language Modelling at low cost.
 
 We are using the pre-training weights of the OPT model provided by Hugging Face Hub on the raw WikiText-2 (no tokens were replaced before
 the tokenization). This training script is adapted from the [HuggingFace Language Modelling examples](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling).
diff --git a/extensions/pybind/flash_attention/flash_attention_dao_cuda.py b/extensions/pybind/flash_attention/flash_attention_dao_cuda.py
index a108377a8dcf..560d952f6926 100644
--- a/extensions/pybind/flash_attention/flash_attention_dao_cuda.py
+++ b/extensions/pybind/flash_attention/flash_attention_dao_cuda.py
@@ -57,14 +57,14 @@ def flash_attention(
             q_indices: Optional[torch.Tensor] = None,
             kv_indices: Optional[torch.Tensor] = None,
         ):
-            # [B, N, S, D] -> [B, S, N, D]
+            # [B, H, S, D] -> [B, S, H, D]
             q = q.transpose(1, 2)
             k = k.transpose(1, 2)
             v = v.transpose(1, 2)
             b, s_q = q.shape[:2]
             if cu_seqlens_q is not None:
                 # padded / padded causal
-                # unpad input: [B, S, N, D] -> [T, N, D]
+                # unpad input: [B, S, H, D] -> [T, H, D]
                 q = _unpad_input(q, q_indices)
                 kv = _unpad_input(torch.stack(tensors=(k, v), dim=2), kv_indices)
                 attn_output = flash_attn_varlen_kvpacked_func(
@@ -78,7 +78,7 @@ def flash_attention(
                     softmax_scale=scale,
                     causal=is_causal,
                 )
-                # pad output: [T, N, D] -> [B, S, N, D]
+                # pad output: [T, H, D] -> [B, S, H, D]
                 attn_output = pad_input(attn_output, q_indices, b, s_q)
             else:
                 # causal / no attn mask
@@ -90,7 +90,7 @@ def flash_attention(
                     softmax_scale=scale,
                     causal=is_causal,
                 )
-            # [B, S, N, D] -> [B, N, S, D]
+            # [B, S, H, D] -> [B, H, S, D]
             return attn_output.transpose(1, 2)
 
         return flash_attention
diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index 93a3690fe1d3..3fcf53e1858e 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -9,7 +9,7 @@ torchx-nightly==2022.6.29 # torchrec 0.2.0 requires torchx-nightly. This package
 torchrec==0.2.0
 contexttimer
 einops
-triton==2.1.0
+triton
 requests==2.27.1 # downgrade to avoid huggingface error https://github.com/huggingface/transformers/issues/17611
 SentencePiece
 ninja
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 651eb66e89ab..578122d47072 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -8,7 +8,7 @@ click
 fabric
 contexttimer
 ninja
-torch>=2.1.0,<=2.3.0
+torch>=2.1.0,<=2.4.0
 safetensors
 einops
 pydantic
diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py
index 66c794a7d891..9c1a11e7bc29 100644
--- a/tests/kit/model_zoo/__init__.py
+++ b/tests/kit/model_zoo/__init__.py
@@ -22,9 +22,9 @@
     "transformers_bloom_for_causal_lm",
     "transformers_falcon_for_causal_lm",
     "transformers_chatglm_for_conditional_generation",
-    "transformers_llama_for_casual_lm",
+    "transformers_llama_for_causal_lm",
     "transformers_vit_for_masked_image_modeling",
-    "transformers_mistral_for_casual_lm",
+    "transformers_mistral_for_causal_lm",
 ]
 
 IS_FAST_TEST = os.environ.get("FAST_TEST", "0") == "1"
diff --git a/tests/kit/model_zoo/transformers/command.py b/tests/kit/model_zoo/transformers/command.py
index a8b8842c5907..3f4ea45838d7 100644
--- a/tests/kit/model_zoo/transformers/command.py
+++ b/tests/kit/model_zoo/transformers/command.py
@@ -32,8 +32,8 @@ def data_gen():
 
         return dict(input_ids=input_ids, attention_mask=attention_mask)
 
-    # label is needed for casual lm
-    def data_gen_for_casual_lm():
+    # label is needed for causal lm
+    def data_gen_for_causal_lm():
         data = data_gen()
         labels = data["input_ids"].clone()
         data["labels"] = labels
@@ -44,7 +44,7 @@ def data_gen_for_casual_lm():
 
     # function to get the loss
     loss_fn = lambda output: output["last_hidden_state"].mean()
-    loss_fn_for_casual_lm = lambda output: output["loss"]
+    loss_fn_for_causal_lm = lambda output: output["loss"]
     loss_fn_for_seq_classification = lambda output: output["logits"].mean()
 
     config = CohereConfig(
@@ -70,10 +70,10 @@ def data_gen_for_casual_lm():
         model_attribute=ModelAttribute(has_control_flow=True),
     )
     model_zoo.register(
-        name="transformers_command_for_casual_lm",
+        name="transformers_command_for_causal_lm",
         model_fn=lambda: transformers.CohereForCausalLM(config),
-        data_gen_fn=data_gen_for_casual_lm,
+        data_gen_fn=data_gen_for_causal_lm,
         output_transform_fn=output_transform_fn,
-        loss_fn=loss_fn_for_casual_lm,
+        loss_fn=loss_fn_for_causal_lm,
         model_attribute=ModelAttribute(has_control_flow=True),
     )
diff --git a/tests/kit/model_zoo/transformers/llama.py b/tests/kit/model_zoo/transformers/llama.py
index 61fa560506c2..05ac9d8d24ed 100644
--- a/tests/kit/model_zoo/transformers/llama.py
+++ b/tests/kit/model_zoo/transformers/llama.py
@@ -33,20 +33,21 @@ def data_gen():
                 [1, 15043, 29892, 590, 11203, 338, 274, 1082, 1, 15043, 29892, 590, 11203, 338, 274, 1082],
             ]
         ).long()
-
-        attention_mask = torch.Tensor(
-            [
-                [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-                [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
-            ]
-        ).long()
-
+        attention_mask = torch.ones_like(input_ids)
         return dict(input_ids=input_ids, attention_mask=attention_mask)
 
-    # label is needed for casual lm
-    def data_gen_for_casual_lm():
+    # label is needed for causal lm
+    def data_gen_for_causal_lm():
         data = data_gen()
+
+        # Test padded sequence
+        padding = torch.zeros(2, data["input_ids"].shape[1] // 2, dtype=torch.long)
+        data["input_ids"] = torch.cat([data["input_ids"], padding], dim=1)
+        data["attention_mask"] = torch.cat([data["attention_mask"], padding], dim=1)
+
+        ignore_idx = -100
         labels = data["input_ids"].clone()
+        labels[~data["attention_mask"].bool()] = ignore_idx
         data["labels"] = labels
         return data
 
@@ -55,7 +56,7 @@ def data_gen_for_casual_lm():
 
     # function to get the loss
     loss_fn = lambda output: output["last_hidden_state"].mean()
-    loss_fn_for_casual_lm = lambda output: output["loss"]
+    loss_fn_for_causal_lm = lambda output: output["loss"]
     loss_fn_for_seq_classification = lambda output: output["logits"].mean()
 
     config = LlamaConfig(
@@ -70,23 +71,23 @@ def data_gen_for_casual_lm():
         config.pad_token_id = config.eos_token_id
 
     # register the following models
-    # transformers.LlamaModel,
     # transformers.LlamaForCausalLM,
+    # transformers.LlamaModel,
     # transformers.LlamaForSequenceClassification,
     model_zoo.register(
-        name="transformers_llama",
-        model_fn=lambda: transformers.LlamaModel(config),
-        data_gen_fn=data_gen,
+        name="transformers_llama_for_causal_lm",
+        model_fn=lambda: transformers.LlamaForCausalLM(config),
+        data_gen_fn=data_gen_for_causal_lm,
         output_transform_fn=output_transform_fn,
-        loss_fn=loss_fn,
+        loss_fn=loss_fn_for_causal_lm,
         model_attribute=ModelAttribute(has_control_flow=True),
     )
     model_zoo.register(
-        name="transformers_llama_for_casual_lm",
-        model_fn=lambda: transformers.LlamaForCausalLM(config),
-        data_gen_fn=data_gen_for_casual_lm,
+        name="transformers_llama",
+        model_fn=lambda: transformers.LlamaModel(config),
+        data_gen_fn=data_gen,
         output_transform_fn=output_transform_fn,
-        loss_fn=loss_fn_for_casual_lm,
+        loss_fn=loss_fn,
         model_attribute=ModelAttribute(has_control_flow=True),
     )
     model_zoo.register(
diff --git a/tests/kit/model_zoo/transformers/mistral.py b/tests/kit/model_zoo/transformers/mistral.py
index ae5a9700240a..43fc662cc840 100644
--- a/tests/kit/model_zoo/transformers/mistral.py
+++ b/tests/kit/model_zoo/transformers/mistral.py
@@ -64,7 +64,7 @@ def data_gen_for_sequence_classification():
     model_attribute=ModelAttribute(has_control_flow=True),
 )
 model_zoo.register(
-    name="transformers_mistral_for_casual_lm",
+    name="transformers_mistral_for_causal_lm",
     model_fn=lambda: transformers.MistralForCausalLM(config),
     data_gen_fn=data_gen_for_lm,
     output_transform_fn=output_transform_fn,
diff --git a/tests/kit/model_zoo/transformers/mixtral.py b/tests/kit/model_zoo/transformers/mixtral.py
index 73c0e9e2c500..40e5a7b0232d 100644
--- a/tests/kit/model_zoo/transformers/mixtral.py
+++ b/tests/kit/model_zoo/transformers/mixtral.py
@@ -53,6 +53,8 @@ def data_gen_for_sequence_classification():
     num_attention_heads=8,
     num_hidden_layers=2,
     vocab_size=1000,
+    attn_implementation="flash_attention_2",
+    torch_dtype="float16",
     output_router_logits=True,
 )
 
diff --git a/tests/kit/model_zoo/transformers/qwen2.py b/tests/kit/model_zoo/transformers/qwen2.py
index 1c26af698497..83bc9f941be7 100644
--- a/tests/kit/model_zoo/transformers/qwen2.py
+++ b/tests/kit/model_zoo/transformers/qwen2.py
@@ -33,8 +33,8 @@ def data_gen():
         attention_mask = torch.Tensor([[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]).long()
         return dict(input_ids=input_ids, attention_mask=attention_mask)
 
-    # label is needed for casual lm
-    def data_gen_for_casual_lm():
+    # label is needed for causal lm
+    def data_gen_for_causal_lm():
         data = data_gen()
         labels = data["input_ids"].clone()
         data["labels"] = labels
@@ -45,7 +45,7 @@ def data_gen_for_casual_lm():
 
     # function to get the loss
     loss_fn = lambda output: output["last_hidden_state"].mean()
-    loss_fn_for_casual_lm = lambda output: output["loss"]
+    loss_fn_for_causal_lm = lambda output: output["loss"]
     loss_fn_for_seq_classification = lambda output: output["logits"].mean()
 
     config = Qwen2Config(
@@ -72,11 +72,11 @@ def data_gen_for_casual_lm():
         model_attribute=ModelAttribute(has_control_flow=True),
     )
     model_zoo.register(
-        name="transformers_qwen2_for_casual_lm",
+        name="transformers_qwen2_for_causal_lm",
         model_fn=lambda: transformers.Qwen2ForCausalLM(config),
-        data_gen_fn=data_gen_for_casual_lm,
+        data_gen_fn=data_gen_for_causal_lm,
         output_transform_fn=output_transform_fn,
-        loss_fn=loss_fn_for_casual_lm,
+        loss_fn=loss_fn_for_causal_lm,
         model_attribute=ModelAttribute(has_control_flow=True),
     )
     model_zoo.register(
diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py
index e57cadfd8673..3e85329553e0 100644
--- a/tests/test_booster/test_plugin/test_3d_plugin.py
+++ b/tests/test_booster/test_plugin/test_3d_plugin.py
@@ -97,7 +97,7 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
 
     # TODO(ver217): add more models
     for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.get_sub_registry(
-        "transformers_llama_for_casual_lm"
+        "transformers_llama_for_causal_lm"
     ).items():
         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
 
diff --git a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
index 8c59f430c2d9..c2a08a541bc7 100644
--- a/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
+++ b/tests/test_booster/test_plugin/test_low_level_zero_plugin.py
@@ -105,7 +105,7 @@ def check_low_level_zero_lora(stage, model_name, early_stop: bool = True):
     sub_model_zoo = model_zoo.get_sub_registry(model_name)
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         task_type = None
-        if name == "transformers_llama_for_casual_lm":
+        if name == "transformers_llama_for_causal_lm":
             task_type = "CAUSAL_LM"
         if name == "transformers_llama_for_sequence_classification":
             task_type = "SEQ_CLS"
diff --git a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
index f92b5c6e5675..2a3b6e5a3a29 100644
--- a/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
+++ b/tests/test_booster/test_plugin/test_torch_ddp_plugin.py
@@ -47,7 +47,7 @@ def check_torch_ddp_plugin():
         registry = model_zoo
 
     for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in registry.items():
-        if name == "dlrm_interactionarch" or name.startswith("simple_"):
+        if name in ("dlrm_interactionarch", "transformers_mixtral") or name.startswith("simple_"):
             continue
         run_fn(model_fn, data_gen_fn, output_transform_fn)
         torch.cuda.empty_cache()
diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
index fd13ce0bfadc..b133be948c1e 100644
--- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py
@@ -74,7 +74,7 @@ def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: b
 @clear_cache_before_run()
 @parameterize("placement_config", OPTIM_PLACEMENT_CONFIGS)
 @parameterize("shard", [True, False])
-@parameterize("model_name", ["transformers_llama_for_casual_lm"])
+@parameterize("model_name", ["transformers_llama_for_causal_lm"])
 @parameterize("size_per_shard", [32])
 @parameterize("tp_size", [1, 2])
 @parameterize("zero_size", [2])
diff --git a/tests/test_checkpoint_io/test_gemini_torch_compability.py b/tests/test_checkpoint_io/test_gemini_torch_compability.py
index 4897907ffc8a..ce4d10322ba5 100644
--- a/tests/test_checkpoint_io/test_gemini_torch_compability.py
+++ b/tests/test_checkpoint_io/test_gemini_torch_compability.py
@@ -20,7 +20,7 @@
 
 @clear_cache_before_run()
 @parameterize("shard", [False, True])
-@parameterize("model_name", ["transformers_llama_for_casual_lm"])
+@parameterize("model_name", ["transformers_llama_for_causal_lm"])
 def exam_torch_load_from_gemini(shard: bool, model_name: str):
     (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
     criterion = lambda x: x.mean()
diff --git a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
index 4f8f260417a3..86d7924fb828 100644
--- a/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_hybrid_parallel_plugin_checkpoint_io.py
@@ -39,7 +39,7 @@
 
 
 @parameterize("shard", [True, False])
-@parameterize("model_name", ["transformers_llama_for_casual_lm"])
+@parameterize("model_name", ["transformers_llama_for_causal_lm"])
 @parameterize("size_per_shard", [32])
 @parameterize("test_config", TEST_CONFIGS)
 @clear_cache_before_run()
diff --git a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py
index ab48944d4eaa..a8e05a25ad28 100644
--- a/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py
+++ b/tests/test_checkpoint_io/test_low_level_zero_checkpoint_io.py
@@ -149,7 +149,7 @@ def check_low_level_zero_lora_checkpointIO(
         if name != "transformers_llama":
             continue
         task_type = None
-        if name == "transformers_llama_for_casual_lm":
+        if name == "transformers_llama_for_causal_lm":
             task_type = "CAUSAL_LM"
         if name == "transformers_llama_for_sequence_classification":
             task_type = "SEQ_CLS"
diff --git a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
index df8636141e2a..6f8eb2ad26cd 100644
--- a/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
+++ b/tests/test_checkpoint_io/test_plugins_huggingface_compatibility.py
@@ -18,7 +18,7 @@
 
 
 @clear_cache_before_run()
-@parameterize("model_name", ["transformers_llama_for_casual_lm"])
+@parameterize("model_name", ["transformers_llama_for_causal_lm"])
 @parameterize("plugin_type", ["ddp", "zero", "gemini"])
 def exam_from_pretrained(plugin_type: str, model_name: str, shard=True, size_per_shard=32):
     (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next(
diff --git a/tests/test_lazy/test_models.py b/tests/test_lazy/test_models.py
index c85860a8d253..0a919955f26a 100644
--- a/tests/test_lazy/test_models.py
+++ b/tests/test_lazy/test_models.py
@@ -18,9 +18,17 @@ def test_models_lazy_init(subset, default_device):
     sub_model_zoo = model_zoo.get_sub_registry(subset, allow_empty=True)
     for name, entry in sub_model_zoo.items():
         # TODO(ver217): lazy init does not support weight norm, skip these models
-        if name in ("torchaudio_wav2vec2_base", "torchaudio_hubert_base") or name.startswith(
-            ("transformers_vit", "transformers_blip2", "transformers_whisper")
-        ):
+        if name in (
+            "torchaudio_wav2vec2_base",
+            "torchaudio_hubert_base",
+            "timm_beit",
+            "timm_vision_transformer",
+            "timm_deit",
+            "timm_beitv2",
+            "timm_deit3",
+            "timm_convit",
+            "timm_tnt_b_patch16_224",
+        ) or name.startswith(("transformers_vit", "transformers_blip2", "transformers_whisper")):
             continue
         check_lazy_init(entry, verbose=True, default_device=default_device)
 
diff --git a/tests/test_lora/test_lora.py b/tests/test_lora/test_lora.py
index 1ae17025d31e..b0ec767cc332 100644
--- a/tests/test_lora/test_lora.py
+++ b/tests/test_lora/test_lora.py
@@ -91,7 +91,7 @@ def run_lora_test():
     sub_model_zoo = model_zoo.get_sub_registry("transformers_llama")
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         task_type = None
-        if name == "transformers_llama_for_casual_lm":
+        if name == "transformers_llama_for_causal_lm":
             task_type = "CAUSAL_LM"
         if name == "transformers_llama_for_sequence_classification":
             task_type = "SEQ_CLS"
diff --git a/tests/test_pipeline/test_schedule/test_interleaved.py b/tests/test_pipeline/test_schedule/test_interleaved.py
index a626b834a891..04a1296e60eb 100644
--- a/tests/test_pipeline/test_schedule/test_interleaved.py
+++ b/tests/test_pipeline/test_schedule/test_interleaved.py
@@ -6,6 +6,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from torch.testing import assert_close
 
 import colossalai
 from colossalai.cluster import ProcessGroupMesh
@@ -107,13 +108,13 @@ def criterion(x, *args, **kwargs):
 
     # check loss
     if stage_manager.is_last_stage(ignore_chunk=True):
-        assert torch.allclose(torch_loss, pp_ret["loss"])
+        assert_close(torch_loss, pp_ret["loss"])
 
     # check gradients
     for i in range(num_model_chunk):
         idx = world_size * i + rank
-        assert torch.allclose(torch_model.layers[idx].weight.grad, sharded_model[i].weight.grad)
-        assert torch.allclose(torch_model.layers[idx].bias.grad, sharded_model[i].bias.grad)
+        assert_close(torch_model.layers[idx].weight.grad, sharded_model[i].weight.grad)
+        assert_close(torch_model.layers[idx].bias.grad, sharded_model[i].bias.grad)
 
     # step
     torch_optimizer.step()
@@ -123,8 +124,8 @@ def criterion(x, *args, **kwargs):
     # check updated param
     for i in range(num_model_chunk):
         idx = world_size * i + rank
-        assert torch.allclose(torch_model.layers[idx].weight, sharded_model[i].weight)
-        assert torch.allclose(torch_model.layers[idx].bias, sharded_model[i].bias)
+        assert_close(torch_model.layers[idx].weight, sharded_model[i].weight)
+        assert_close(torch_model.layers[idx].bias, sharded_model[i].bias)
 
     # forward only
     with torch.no_grad():
@@ -135,14 +136,14 @@ def criterion(x, *args, **kwargs):
             sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
         )
         if stage_manager.is_last_stage(ignore_chunk=True):
-            assert torch.allclose(torch_loss, pp_ret["loss"])
+            assert_close(torch_loss, pp_ret["loss"])
 
         for layer in sharded_model:
             if layer.weight.grad is None:
                 assert layer.weight.grad is None and layer.bias.grad is None
             else:
-                assert torch.allclose(layer.weight.grad, torch.zeros_like(layer.weight.grad))
-                assert torch.allclose(layer.bias.grad, torch.zeros_like(layer.bias.grad))
+                assert_close(layer.weight.grad, torch.zeros_like(layer.weight.grad))
+                assert_close(layer.bias.grad, torch.zeros_like(layer.bias.grad))
 
 
 @pytest.mark.dist
diff --git a/tests/test_pipeline/test_schedule/test_oneF_oneB.py b/tests/test_pipeline/test_schedule/test_oneF_oneB.py
index c4bfa7b697f8..8ae4f6daabd1 100644
--- a/tests/test_pipeline/test_schedule/test_oneF_oneB.py
+++ b/tests/test_pipeline/test_schedule/test_oneF_oneB.py
@@ -6,6 +6,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+from torch.testing import assert_close
 
 import colossalai
 from colossalai.cluster import ProcessGroupMesh
@@ -103,13 +104,13 @@ def custom_fwd(self, x):
 
     # check loss
     if stage_manager.is_last_stage():
-        assert torch.allclose(torch_loss, pp_ret["loss"])
+        assert_close(torch_loss, pp_ret["loss"])
 
     # check gradients
     for i in range(len(sharded_model)):
         idx = rank * num_local_layer + i
-        assert torch.allclose(torch_model.layers[idx].weight.grad, sharded_model[i].weight.grad)
-        assert torch.allclose(torch_model.layers[idx].bias.grad, sharded_model[i].bias.grad)
+        assert_close(torch_model.layers[idx].weight.grad, sharded_model[i].weight.grad)
+        assert_close(torch_model.layers[idx].bias.grad, sharded_model[i].bias.grad)
 
     # step
     torch_optimizer.step()
@@ -119,8 +120,8 @@ def custom_fwd(self, x):
     # check updated param
     for i in range(len(sharded_model)):
         idx = rank * num_local_layer + i
-        assert torch.allclose(torch_model.layers[idx].weight, sharded_model[i].weight)
-        assert torch.allclose(torch_model.layers[idx].bias, sharded_model[i].bias)
+        assert_close(torch_model.layers[idx].weight, sharded_model[i].weight)
+        assert_close(torch_model.layers[idx].bias, sharded_model[i].bias)
 
     # forward only
     with torch.no_grad():
@@ -131,14 +132,14 @@ def custom_fwd(self, x):
             sharded_model, iter(input_list), criterion, pp_optimizer, return_loss=True
         )
         if stage_manager.is_last_stage():
-            assert torch.allclose(torch_loss, pp_ret["loss"])
+            assert_close(torch_loss, pp_ret["loss"])
 
         for layer in sharded_model:
             if layer.weight.grad is None:
                 assert layer.weight.grad is None and layer.bias.grad is None
             else:
-                assert torch.allclose(layer.weight.grad, torch.zeros_like(layer.weight.grad))
-                assert torch.allclose(layer.bias.grad, torch.zeros_like(layer.bias.grad))
+                assert_close(layer.weight.grad, torch.zeros_like(layer.weight.grad))
+                assert_close(layer.bias.grad, torch.zeros_like(layer.bias.grad))
 
 
 def run_dist(
diff --git a/tests/test_shardformer/test_flash_attention.py b/tests/test_shardformer/test_flash_attention.py
index 9aa24a166221..42ca6b198b5e 100644
--- a/tests/test_shardformer/test_flash_attention.py
+++ b/tests/test_shardformer/test_flash_attention.py
@@ -88,6 +88,7 @@ def check_attn_func(dtype: torch.dtype, attn_func, attn_kwargs: dict, padding_ma
         padding_mask = padding_mask[:, None, :, None].logical_not()
         ref_output = ref_output.masked_fill(padding_mask, 0)
         output = output.masked_fill(padding_mask, 0)
+
     assert_close(output, ref_output, **tols)
     output.mean().backward()
     ref_output.mean().backward()
@@ -128,6 +129,8 @@ def test_flash_attn_func(dtype: torch.dtype):
         attn_kwargs, padding_mask = gen_kwargs_func(dtype)
         for attn_func, name, need_postprocess in attn_funcs:
             print(f"{dtype}, {name}, {mask_type}")
+            if mask_type == "padded":
+                pass
             if need_postprocess:
                 check_attn_func(dtype, attn_func, post_process_kwargs_for_raw_attn(attn_kwargs), padding_mask)
             else:
diff --git a/tests/test_shardformer/test_layer/test_ring_attn.py b/tests/test_shardformer/test_layer/test_ring_attn.py
new file mode 100644
index 000000000000..1c7647a7d560
--- /dev/null
+++ b/tests/test_shardformer/test_layer/test_ring_attn.py
@@ -0,0 +1,186 @@
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from flash_attn import flash_attn_qkvpacked_func, flash_attn_varlen_qkvpacked_func
+from torch.testing import assert_close
+
+import colossalai
+from colossalai.shardformer.layer import AttnMaskType
+from colossalai.shardformer.layer.attn import AttnMaskType, RingAttention
+from colossalai.shardformer.layer.utils import split_batch_zigzag, split_varlen_zigzag
+from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.utils import get_current_device
+
+
+@parameterize("seq_len", [4096])
+@parameterize("bs", [2])
+@parameterize("nheads", [5])
+@parameterize("d", [128])
+@parameterize("dtype", [torch.bfloat16, torch.float16])
+def check_ring_attn(seq_len, bs, nheads, d, dtype):
+    torch.cuda.manual_seed(2)
+    device = get_current_device()
+    sp_group = dist.group.WORLD
+    sp_size = dist.get_world_size()
+    # Some outliers may seem large, but our errors are still lower than
+    # than Megatron-LM context parallel's
+    # (https://github.com/NVIDIA/TransformerEngine/blob/33a3d02f81c56e6f7b542c09bfa86657078d57fb/tests/pytorch/fused_attn/run_fused_attn_with_cp.py#L215)
+    # and the original zigzag implementation's (https://github.com/zhuzilin/ring-flash-attention/tree/main)
+    atol = rtol = 7e-3
+
+    # Setup inputs
+    qkv = torch.randn(bs, seq_len, 3, nheads, d, device=device, dtype=dtype, requires_grad=True)
+    local_qkv = split_batch_zigzag(qkv, sp_group)
+    q, k, v = local_qkv.unbind(dim=-3)
+    q, k, v = [x.squeeze(2).detach().clone().transpose(1, 2) for x in (q, k, v)]  # (B, nHeads, Sq, D)
+    q.requires_grad = k.requires_grad = v.requires_grad = True
+
+    # Ring attention vs single GPU
+    ring_out, ring_lse = RingAttention.attention(
+        q,
+        k,
+        v,
+        sp_group,
+        AttnMaskType.CAUSAL,
+        return_softmax=True,
+        inner_ring_size=max(2, sp_size // 2),
+        # inner_ring_size=4
+    )
+    ring_out = ring_out.transpose(1, 2)
+    out, lse, _ = flash_attn_qkvpacked_func(
+        qkv, dropout_p=0.0, causal=True, window_size=(-1, -1), alibi_slopes=None, return_attn_probs=True
+    )
+
+    # Checkout out and softmax denominator
+    local_out = split_batch_zigzag(out, sp_group)
+    local_lse = split_batch_zigzag(lse, sp_group, seq_dim=-1)
+    local_lse = local_lse.transpose(1, 2).contiguous().view(-1, ring_lse.shape[-1])  # (B, nHeads, Sq) -> (T, nHeads)
+    assert_close(ring_lse, local_lse, atol=atol, rtol=rtol)
+    assert_close(ring_out, local_out, atol=atol, rtol=rtol)
+
+    # Check grads
+    ring_out.sum().backward()
+    out.sum().backward()
+    ring_dq, ring_dk, ring_dv = [x.transpose(1, 2) for x in (q.grad, k.grad, v.grad)]
+    dqkv = qkv.grad
+    local_dqkv = split_batch_zigzag(dqkv, sp_group)
+
+    assert_close(ring_dq, local_dqkv[:, :, 0], atol=atol, rtol=rtol)
+    assert_close(ring_dk, local_dqkv[:, :, 1], atol=atol, rtol=rtol)
+    assert_close(ring_dv, local_dqkv[:, :, 2], atol=atol, rtol=rtol)
+    if dist.get_rank() == 0:
+        print(
+            f"sp_size {dist.get_world_size()}, inner ring size {dist.get_world_size(RingAttention.INNER_RING_GROUP)} passed."
+        )
+
+
+@parameterize("seqlen", [4096])
+@parameterize("bs", [2])
+@parameterize("nheads", [5])
+@parameterize("d", [128])
+@parameterize("dtype", [torch.bfloat16, torch.float16])
+def check_packed_seq(seqlen, bs, nheads, d, dtype):
+    device = get_current_device()
+    sp_group = dist.group.WORLD
+    sp_size = dist.get_world_size()
+    atol = rtol = 7e-3
+    torch.cuda.manual_seed(2)
+    # Prepare varlen attention mask
+    padding_mask = torch.ones((bs, seqlen), dtype=torch.int, device=device)
+    padding_mask[: bs // 2, (seqlen // 4) * 3 :] = 0
+    padding_mask[:, seqlen // 2 :] = 0
+
+    input_embeds = torch.randn(bs, seqlen, nheads, d, device=device, dtype=dtype, requires_grad=True)
+
+    # Forward
+    # out = ColoAttention.attention(q, k, v, **mask_info)
+    flat_input = input_embeds.view(-1, nheads, d)[padding_mask.flatten().nonzero().squeeze()]
+    qkv = torch.stack([flat_input] * 3, dim=1)
+    qkv.retain_grad()
+
+    input_embeds, mask_info, _ = RingAttention.prepare_varlen_batch(padding_mask, sp_group, input_embeds)
+    out, lse, _ = flash_attn_varlen_qkvpacked_func(
+        qkv,
+        mask_info["cu_seqlens"] * sp_size,
+        mask_info["max_seqlen"] * sp_size,
+        return_attn_probs=True,
+        causal=True,
+        # deterministic=True
+    )
+    # Test the splitting function
+    local_input = split_varlen_zigzag(
+        flat_input, mask_info["cu_seqlens"] * sp_size, sp_group, mask_info["max_seqlen"] * sp_size
+    )
+    assert (local_input == input_embeds.view(-1, nheads, d)[mask_info["valid_indices"]]).all()
+    del local_input, flat_input
+
+    q_ring, k_ring, v_ring = [input_embeds.clone().transpose(1, 2) for _ in range(3)]
+    q_ring.retain_grad()
+    k_ring.retain_grad()
+    v_ring.retain_grad()
+
+    ring_out, ring_lse = RingAttention.attention(
+        q_ring,
+        k_ring,
+        v_ring,
+        sp_group,
+        **mask_info,
+        pad_output=False,
+        return_softmax=True,
+        # deterministic=True
+    )
+    ring_out = ring_out.transpose(1, 2).reshape(-1, nheads, d)
+    # Check output
+    lse = lse.transpose(0, 1)
+    out, lse = split_varlen_zigzag(
+        [out, lse], mask_info["cu_seqlens"] * sp_size, sp_group, mask_info["max_seqlen"] * sp_size
+    )
+    assert_close(lse, ring_lse, atol=atol, rtol=rtol)
+    assert_close(out, ring_out, atol=atol, rtol=rtol)
+
+    # Check grads
+    labels = torch.ones(out.shape[0], dtype=dtype, device=device)
+    F.mse_loss(out.sum((-2, -1)), labels).backward()
+    F.mse_loss(ring_out.sum((-2, -1)), labels[: ring_out.shape[0]]).backward()
+    dq, dk, dv = [
+        split_varlen_zigzag(
+            qkv.grad[:, i], mask_info["cu_seqlens"] * sp_size, sp_group, mask_info["max_seqlen"] * sp_size
+        )
+        for i in range(3)
+    ]
+    dq_ring, dk_ring, dv_ring = [
+        x.transpose(1, 2).reshape(-1, nheads, d)[mask_info["valid_indices"]]
+        for x in (q_ring.grad, k_ring.grad, v_ring.grad)
+    ]
+
+    assert_close(dq, dq_ring, atol=atol, rtol=rtol)
+    assert_close(dk, dk_ring, atol=atol, rtol=rtol)
+    assert_close(dv, dv_ring, atol=atol, rtol=rtol)
+
+
+def launch_single_ring(rank, world_size, port):
+    colossalai.launch(rank, world_size, "localhost", port)
+    check_packed_seq()
+    check_ring_attn()
+
+
+def launch_double_ring(rank, world_size, port):
+    colossalai.launch(rank, world_size, "localhost", port)
+    check_ring_attn()
+
+
+@rerun_if_address_is_in_use()
+@parameterize("world_size", [2])
+def test_ring_attn(world_size):
+    spawn(launch_single_ring, nprocs=world_size)
+
+
+@rerun_if_address_is_in_use()
+@parameterize("world_size", [4])
+def test_double_ring(world_size):
+    spawn(launch_double_ring, nprocs=world_size)
+
+
+if __name__ == "__main__":
+    test_ring_attn()
+    test_double_ring()
diff --git a/tests/test_shardformer/test_model/_utils.py b/tests/test_shardformer/test_model/_utils.py
index 190fee12931b..9ad84341ac9e 100644
--- a/tests/test_shardformer/test_model/_utils.py
+++ b/tests/test_shardformer/test_model/_utils.py
@@ -10,6 +10,7 @@
 from torch.nn import Module
 from torch.optim import Adam, Optimizer
 from torch.testing import assert_close
+from transformers.modeling_outputs import BaseModelOutputWithPast
 
 from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
@@ -259,7 +260,6 @@ def _criterion(outputs, inputs):
     org_output = org_model(**unshard_test_data)
     org_loss = criterion(org_output)
     org_loss.backward()
-
     return org_loss, org_output, sharded_loss, sharded_output
 
 
@@ -302,11 +302,12 @@ def _criterion(outputs, inputs):
 
 
 def check_output_hidden_state(
-    org_output: Tensor,
-    sharded_output: Tensor,
+    org_output: BaseModelOutputWithPast,
+    sharded_output: BaseModelOutputWithPast,
     stage_manager: Optional[PipelineStageManager] = None,
     atol: float = 1e-5,
     rtol: float = 1e-3,
+    shard_config: Optional[ShardConfig] = None,
 ):
     org_hidden_state = org_output.last_hidden_state
 
@@ -315,6 +316,14 @@ def check_output_hidden_state(
     else:
         sharded_hidden_state = sharded_output.last_hidden_state
 
+    # Check if the output sequence is gathered before cross entropy
+    if shard_config is not None:
+        seq_dim = 1
+        sp_group = shard_config.sequence_parallel_process_group
+        sp_size = shard_config.sequence_parallel_size
+        if org_hidden_state.shape[seq_dim] == sharded_hidden_state.shape[seq_dim] * sp_size:
+            org_hidden_state = org_hidden_state.chunk(sp_size, dim=seq_dim)[dist.get_rank(sp_group)]
+
     assert_close(org_hidden_state.float(), sharded_hidden_state.float(), atol=atol, rtol=rtol)
 
 
@@ -374,8 +383,11 @@ def get_grad_tensors_for_check(
             shard_grad = torch.cat(shard_grad_list, dim=dim)
 
         # embedding may be resized when using tensor parallel
-        if shard_grad.shape[0] > org_grad.shape[0]:
-            shard_grad = shard_grad[: org_grad.shape[0], :]
+        try:
+            if shard_grad.shape[0] > org_grad.shape[0]:
+                shard_grad = shard_grad[: org_grad.shape[0], :]
+        except:
+            pass
         if verbose and dist.get_rank() == 0:
             print(f"'{suffix}' grad: {org_grad}, {shard_grad}")
 
@@ -404,9 +416,6 @@ def check_grad(
         org_grad = getattr_(org_model, suffix).weight.grad
         shard_grad = getattr_(sharded_model, suffix).weight.grad
         shard_weight = getattr_(sharded_model, suffix).weight
-        # if verbose and dist.get_rank() == 0:
-        #     print("shard_weight", shard_weight)
-        #     print("org_grad", org_grad)
         if is_distributed_tensor(shard_weight) or is_customized_distributed_tensor(shard_weight):
             shard_grad_list = [torch.zeros_like(shard_grad).to("cuda") for _ in range(dist.get_world_size(tp_group))]
             dist.all_gather(shard_grad_list, shard_grad, tp_group)
@@ -440,7 +449,7 @@ def check_all_grad_tensors(check_tensors):
     "org_grad": tensor to be compared from the original model
     "shard_grad": tensor to be compared from the sharded model
     """
-    for suffix, check_info in check_tensors.items():
+    for idx, (suffix, check_info) in enumerate(check_tensors.items()):
         org_grad = check_info["org_grad"]
         shard_grad = check_info["shard_grad"]
         rtol = check_info["rtol"]
diff --git a/tests/test_shardformer/test_model/test_shard_command.py b/tests/test_shardformer/test_model/test_shard_command.py
index 3281b50e1d5d..efe5cee2a2b6 100644
--- a/tests/test_shardformer/test_model/test_shard_command.py
+++ b/tests/test_shardformer/test_model/test_shard_command.py
@@ -271,7 +271,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     ],
 )
 def run_command_test(test_config):
-    sub_model_zoo = model_zoo.get_sub_registry("transformers_command", "transformers_command_for_casual_lm")
+    sub_model_zoo = model_zoo.get_sub_registry("transformers_command", "transformers_command_for_causal_lm")
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
@@ -321,7 +321,7 @@ def run_command_test(test_config):
     ],
 )
 def run_command_3d_test(test_config):
-    sub_model_zoo = model_zoo.get_sub_registry("transformers_command", "transformers_command_for_casual_lm")
+    sub_model_zoo = model_zoo.get_sub_registry("transformers_command", "transformers_command_for_causal_lm")
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index 88e54176b9fd..3c66f609787a 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -63,7 +63,9 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         and booster.plugin.shard_config.sequence_parallelism_mode == "all_to_all"
     ):
         master2working = sharded_optimizer.get_master_to_working_map()
-        for p1, p2 in zip(llama_model.parameters(), sharded_optimizer._master_param_groups_of_current_rank[0]):
+        for (name, p1), p2 in zip(
+            llama_model.named_parameters(), sharded_optimizer._master_param_groups_of_current_rank[0]
+        ):
             working_p = master2working[id(p2)]
             grads = sharded_optimizer.get_partitioned_gradients_by_param_id(0, id(working_p))
             grad_index = (
@@ -73,7 +75,10 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             )
             grad = grads[grad_index]
             sharded_grad = p1.grad.view(-1).chunk(dist.get_world_size())[dist.get_rank()]
-            assert_close(sharded_grad, grad[: sharded_grad.shape[0]], atol=5e-3, rtol=5e-3, check_dtype=False)
+            try:
+                assert_close(sharded_grad, grad[: sharded_grad.shape[0]], atol=5e-3, rtol=5e-3, check_dtype=False)
+            except Exception as e:
+                raise RuntimeError(f"Failed to check grad for {name}") from e
 
     # Save gradient tensors for comparison between the original model and the sharded model before optimizer step.
     grads_to_check = {}
@@ -114,75 +119,103 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             atol, rtol = 5e-3, 5e-3
 
         if org_model.__class__.__name__ == "LlamaModel":
-            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
-
+            check_output_hidden_state(
+                org_output,
+                sharded_output,
+                stage_manager,
+                atol=atol,
+                rtol=rtol,
+                shard_config=booster.plugin.shard_config,
+            )
         check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol)
-
     # check weights
     if stage_manager is None or stage_manager.is_first_stage(ignore_chunk=True):
         if test_config["precision"] == "fp32":
             atol, rtol = 1e-4, 1e-3
         else:
             atol, rtol = 5e-3, 5e-3
-        try:
-            check_weight(
-                llama_model,
-                shard_llama_model,
-                col_layer_for_check,
-                tp_group,
-                atol=atol,
-                rtol=rtol,
-                dim=1,
-                verbose=False,
-            )
-        except Exception as e:
-            print(f"Failed config: {test_config}")
-            raise e
+        check_weight(
+            llama_model,
+            shard_llama_model,
+            col_layer_for_check,
+            tp_group,
+            atol=atol,
+            rtol=rtol,
+            dim=1,
+            verbose=False,
+        )
 
     # check grads
     check_all_grad_tensors(grads_to_check)
-
     torch.cuda.empty_cache()
 
 
 @parameterize(
     "test_config",
     [
-        {  # Ulysess + Flash attention
+        # Double Ring Attention
+        {
+            "tp_size": 1,
+            "pp_size": 1,
+            "sp_size": 4,
+            "num_microbatches": 1,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "ring_attn",
+            "use_lazy_init": True,
+            "zero_stage": 0,
+            "precision": "fp16",
+            "initial_scale": 1,
+            "inner_ring_size": 2,
+        },
+        # Ring Attention + PP
+        {
             "tp_size": 1,
             "pp_size": 2,
             "sp_size": 2,
             "num_microbatches": 2,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
-            "enable_flash_attention": True,
+            "sequence_parallelism_mode": "ring_attn",
             "use_lazy_init": True,
-            "zero_stage": 0,
+            "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {  # Test ring + Flash attention
+        # Ring Attention + TP
+        {
             "tp_size": 2,
             "pp_size": 1,
             "sp_size": 2,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "ring",
-            "enable_flash_attention": True,
+            "sequence_parallelism_mode": "ring_attn",
             "use_lazy_init": True,
             "zero_stage": 2,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 1,
+        {  # Ulysess + TP
+            "tp_size": 2,
             "pp_size": 1,
             "sp_size": 2,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
             "sequence_parallelism_mode": "all_to_all",
+            "enable_all_optimization": True,
             "use_lazy_init": True,
-            "zero_stage": 1,
+            "zero_stage": 0,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {  # Ulysess + PP
+            "tp_size": 1,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "all_to_all",
+            "enable_all_optimization": True,
+            "use_lazy_init": True,
+            "zero_stage": 0,
             "precision": "fp16",
             "initial_scale": 1,
         },
@@ -192,8 +225,21 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
             "sequence_parallelism_mode": "split_gather",
-            "enable_flash_attention": False,
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 2,
+            "pp_size": 1,
+            "sp_size": 1,
+            "num_microbatches": 1,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "ring",
+            "enable_flash_attention": True,
             "use_lazy_init": True,
+            "zero_stage": 2,
             "precision": "fp16",
             "initial_scale": 1,
         },
@@ -240,12 +286,13 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 )
 def run_llama_test(test_config):
     sub_model_zoo = model_zoo.get_sub_registry("transformers_llama")
-
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
+        if test_config.get("sequence_parallelism_mode", None) == "ring_attn" and "causal" not in name:
+            continue
         try:
             check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
         except Exception as e:
-            print(f"Failed config: {test_config}")
+            print(f"Failed config: {test_config}, model name: {name}")
             raise e
     clear_layout_converter()
     Randomizer.reset_index()

From 971b16a74f84415ac3cfde80d762149b2e65c036 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Thu, 22 Aug 2024 03:00:40 +0000
Subject: [PATCH 054/167] fix

---
 colossalai/booster/plugin/low_level_zero_plugin.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 4188491c2e3c..4082ffada526 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -63,7 +63,7 @@ class OptimizerParamCheckState(enum.Enum):
 
 class LowLevelZeroModel(ModelWrapper, AMPModelMixin):
     def __init__(
-        self, module: nn.Module, precision: str, overlap_allgather: bool = False, use_fp8: bool = False
+        self, module: nn.Module, precision: str, overlap_allgather: bool = False, cast_inputs: bool = True, use_fp8: bool = False
     ) -> None:
         super().__init__(module)
         self.dtype = None
@@ -77,7 +77,7 @@ def __init__(
         self.module = module
         self.convert_fn = None
         self.use_fp8 = use_fp8
-        if self.dtype is not None:
+        if self.dtype is not None and cast_inputs:
             self.convert_fn = partial(_convert_floating_point, dtype=self.dtype)
         self.overlap_allgather = overlap_allgather
         self.op_hooks = []
@@ -342,6 +342,7 @@ def __init__(
         cpu_offload: bool = False,
         master_weights: bool = True,
         verbose: bool = False,
+        cast_inputs: bool = True,
         fp8_communication: bool = False,
         use_fp8: bool = False,
     ) -> None:
@@ -372,6 +373,8 @@ def __init__(
         self.lora_enabled = False
         self.verbose = verbose
         self.logger = get_dist_logger()
+        self.cast_inputs = cast_inputs
+
         self.use_fp8 = use_fp8
         # set class name with stage, for better error message
         setattr(self.__class__, "__name__", f"LowLevelZeroPlugin_ZeRO-{stage}")
@@ -490,6 +493,7 @@ def configure(
                 model,
                 self.precision,
                 overlap_allgather=self.zero_optim_kwargs["overlap_allgather"],
+                cast_inputs=self.cast_inputs,
                 use_fp8=self.use_fp8,
             )
 

From a29255417986bc3c938ab92d8fd22e8906aa0e92 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 22 Aug 2024 03:04:43 +0000
Subject: [PATCH 055/167] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/booster/plugin/low_level_zero_plugin.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 4082ffada526..42bb49bc9ca1 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -63,7 +63,12 @@ class OptimizerParamCheckState(enum.Enum):
 
 class LowLevelZeroModel(ModelWrapper, AMPModelMixin):
     def __init__(
-        self, module: nn.Module, precision: str, overlap_allgather: bool = False, cast_inputs: bool = True, use_fp8: bool = False
+        self,
+        module: nn.Module,
+        precision: str,
+        overlap_allgather: bool = False,
+        cast_inputs: bool = True,
+        use_fp8: bool = False,
     ) -> None:
         super().__init__(module)
         self.dtype = None

From 0bc9a870c0cd198819c49783c2bb4bdee81ed70a Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 23 Aug 2024 13:47:13 +0800
Subject: [PATCH 056/167] Update train_dpo.py

---
 .../ColossalChat/examples/training_scripts/train_dpo.py     | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.py b/applications/ColossalChat/examples/training_scripts/train_dpo.py
index b8de6396f172..3b324ee784e0 100755
--- a/applications/ColossalChat/examples/training_scripts/train_dpo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_dpo.py
@@ -279,10 +279,7 @@ def train(args):
         beta=args.beta,
         gamma=args.gamma,
         length_normalization=args.length_normalization,
-<<<<<<< HEAD
-=======
         apply_loss_mask=not args.disable_loss_mask,
->>>>>>> main
     )
 
     trainer.fit(
@@ -351,10 +348,7 @@ def train(args):
         default=False,
         help="Disable the reference model (enabled by default)",
     )
-<<<<<<< HEAD
-=======
     parser.add_argument("--disable_loss_mask", default=False, action="store_true")
->>>>>>> main
     parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
     parser.add_argument("--lora_config", type=str, default=None, help="low-rank adaptation config file path")
     parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")

From 3b0df30362668c6e3abaa4edbf5ddb621155808d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 23 Aug 2024 05:48:11 +0000
Subject: [PATCH 057/167] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/booster/plugin/low_level_zero_plugin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 9a7534dded1c..42bb49bc9ca1 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -349,7 +349,7 @@ def __init__(
         verbose: bool = False,
         cast_inputs: bool = True,
         fp8_communication: bool = False,
-        use_fp8: bool = False
+        use_fp8: bool = False,
     ) -> None:
         super().__init__()
         assert stage in (1, 2), f"LowLevelZeroPlugin only supports stage 1/2 training"

From 9e767643dd28428f6f1f7e95be2f0a66cf4b2558 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Fri, 23 Aug 2024 13:49:53 +0800
Subject: [PATCH 058/167] Update low_level_zero_plugin.py

---
 colossalai/booster/plugin/low_level_zero_plugin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 42bb49bc9ca1..9a7534dded1c 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -349,7 +349,7 @@ def __init__(
         verbose: bool = False,
         cast_inputs: bool = True,
         fp8_communication: bool = False,
-        use_fp8: bool = False,
+        use_fp8: bool = False
     ) -> None:
         super().__init__()
         assert stage in (1, 2), f"LowLevelZeroPlugin only supports stage 1/2 training"

From dae39999d7308894fa366ed466e1c956b1eb750a Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 26 Aug 2024 03:45:42 +0000
Subject: [PATCH 059/167] fix

---
 .../ColossalChat/examples/inference/round.txt | 104 ------------------
 colossalai/shardformer/layer/attn.py          |   4 -
 2 files changed, 108 deletions(-)
 delete mode 100644 applications/ColossalChat/examples/inference/round.txt

diff --git a/applications/ColossalChat/examples/inference/round.txt b/applications/ColossalChat/examples/inference/round.txt
deleted file mode 100644
index ba02074c1a03..000000000000
--- a/applications/ColossalChat/examples/inference/round.txt
+++ /dev/null
@@ -1,104 +0,0 @@
-
-
-==========
-round 1:
-<s>[INST] <<SYS>>
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-
-<</SYS>>
-
-tell me a story [/INST] Great, let’s hear a story. </s>
-
-==========
-
-
-==========
-round 2:
-<s>[INST] <<SYS>>
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-
-<</SYS>>
-
-tell me a story [/INST] Great, let’s hear a story. </s><s>[INST] calculate 1+1 [/INST] 1+1=2 </s>
-
-==========
-
-
-==========
-round 3:
-<s>[INST] <<SYS>>
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-
-<</SYS>>
-
-tell me a story [/INST] Great, let’s hear a story. </s><s>[INST] calculate 1+1 [/INST] 1+1=2 </s><s>[INST] who is the first president of the USA [/INST] The first president of the United States was George Washington. </s>
-
-==========
-
-
-==========
-round 1:
-<s>[INST] <<SYS>>
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-
-<</SYS>>
-
-who is the first president of the USA? [/INST] The first president of the United States was George Washington. </s>
-
-==========
-
-
-==========
-round 2:
-<s>[INST] <<SYS>>
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-
-<</SYS>>
-
-who is the first president of the USA? [/INST] The first president of the United States was George Washington. </s><s>[INST] tell me a story [/INST] One story that might be interesting is the story of how the United States was founded.  In 1776, the Thirteen Colonies united together to form the new nation of America.  The first president of the new nation was George Washington.  The first state was Pennsylvania.  The first capital was Philadelphia.  The first presidential election was held in 1787, and was between George Read, a Federalist, and John Adams, a Republican.  The Federalists wanted to keep the power of the federal government limited, while the Republicans wanted the power to be spread around to the states.  The Federalists won, and George Washington was elected president. </s>
-
-==========
-
-
-==========
-round 1:
-<s>[INST] <<SYS>>
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-
-<</SYS>>
-
-tell me a story [/INST] I do enjoy telling stories. Is there a particular story you’d like to hear? </s>
-
-==========
-
-
-==========
-round 2:
-<s>[INST] <<SYS>>
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-
-<</SYS>>
-
-tell me a story [/INST] I do enjoy telling stories. Is there a particular story you’d like to hear? </s><s>[INST] about Donald Trump [/INST] I’d be happy to listen to a story about Donald Trump. I have no idea what a “story” is, but I do know a lot about human life and human behavior. I can try.  Do you have a theme or a question you’d like to ask about Donald Trump? </s>
-
-==========
-
-
-==========
-round 3:
-<s>[INST] <<SYS>>
-A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
-
-
-<</SYS>>
-
-tell me a story [/INST] I do enjoy telling stories. Is there a particular story you’d like to hear? </s><s>[INST] about Donald Trump [/INST] I’d be happy to listen to a story about Donald Trump. I have no idea what a “story” is, but I do know a lot about human life and human behavior. I can try.  Do you have a theme or a question you’d like to ask about Donald Trump? </s><s>[INST] Is Donald Trump the president of the United States [/INST] Yes, Donald Trump became the 45th president of the United States in January of 2016. </s>
-
-==========
diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index b102387a0840..5d1a30d8a4b6 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -165,10 +165,6 @@ def prepare_attn_kwargs(
             if s_q != 1:
                 attention_mask = attention_mask.tril(diagonal=0)
             attention_mask = attention_mask.expand(b, s_q, s_kv)
-            attention_mask = torch.ones(s_q, s_kv, dtype=dtype, device=device)
-            if s_q != 1:
-                attention_mask = attention_mask.tril(diagonal=0)
-            attention_mask = attention_mask.expand(b, s_q, s_kv)
         else:
             max_seqlen_q, cu_seqlens_q, q_indices = get_pad_info(q_padding_mask)
             if kv_padding_mask is None:

From 80d24ae519a85a8af147458fa1445049866d51bb Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 26 Aug 2024 03:48:42 +0000
Subject: [PATCH 060/167] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/booster/plugin/low_level_zero_plugin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 9a7534dded1c..42bb49bc9ca1 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -349,7 +349,7 @@ def __init__(
         verbose: bool = False,
         cast_inputs: bool = True,
         fp8_communication: bool = False,
-        use_fp8: bool = False
+        use_fp8: bool = False,
     ) -> None:
         super().__init__()
         assert stage in (1, 2), f"LowLevelZeroPlugin only supports stage 1/2 training"

From d383449fc4300ae3caf9cf481fc87bb4757f00a4 Mon Sep 17 00:00:00 2001
From: Wenxuan Tan <wenxuan.tan@wisc.edu>
Date: Tue, 27 Aug 2024 10:12:21 +0800
Subject: [PATCH 061/167] [CI] Remove triton version for compatibility bug;
 update req torch >=2.2 (#6018)

* remove triton version

* remove torch 2.2

* remove torch 2.1

* debug

* remove 2.1 build tests

* require torch >=2.2

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
---
 .compatibility                                  | 1 -
 .github/workflows/build_on_pr.yml               | 2 +-
 .github/workflows/build_on_schedule.yml         | 2 +-
 .github/workflows/doc_test_on_pr.yml            | 2 +-
 .github/workflows/doc_test_on_schedule.yml      | 2 +-
 .github/workflows/example_check_on_dispatch.yml | 2 +-
 .github/workflows/example_check_on_pr.yml       | 2 +-
 .github/workflows/example_check_on_schedule.yml | 2 +-
 .github/workflows/run_chatgpt_examples.yml      | 2 +-
 .github/workflows/run_chatgpt_unit_tests.yml    | 2 +-
 .github/workflows/run_colossalqa_unit_tests.yml | 2 +-
 README.md                                       | 2 +-
 requirements/requirements.txt                   | 2 +-
 13 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/.compatibility b/.compatibility
index 62d19faffa9e..e1836506aae6 100644
--- a/.compatibility
+++ b/.compatibility
@@ -1,4 +1,3 @@
-2.1.0-12.1.0
 2.2.2-12.1.0
 2.3.0-12.1.0
 2.4.0-12.4.1
diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 58cd8826809a..ceb33c9ac7a8 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -89,7 +89,7 @@ jobs:
     if: needs.detect.outputs.anyLibraryFileChanged == 'true'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch
     timeout-minutes: 90
     defaults:
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index fc688a71bd92..f8ca07d9731e 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -12,7 +12,7 @@ jobs:
     if: github.repository == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
     timeout-minutes: 90
     steps:
diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml
index 31c421846e2c..2e0ff6a59c74 100644
--- a/.github/workflows/doc_test_on_pr.yml
+++ b/.github/workflows/doc_test_on_pr.yml
@@ -56,7 +56,7 @@ jobs:
     needs: detect-changed-doc
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --rm
     timeout-minutes: 30
     defaults:
diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml
index e2491e4607f5..3ea6481f9980 100644
--- a/.github/workflows/doc_test_on_schedule.yml
+++ b/.github/workflows/doc_test_on_schedule.yml
@@ -12,7 +12,7 @@ jobs:
     name: Test the changed Doc
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --rm
     timeout-minutes: 60
     steps:
diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml
index d877b06cee1c..6a65c4ff5462 100644
--- a/.github/workflows/example_check_on_dispatch.yml
+++ b/.github/workflows/example_check_on_dispatch.yml
@@ -45,7 +45,7 @@ jobs:
       fail-fast: false
       matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
     container:
-      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
     timeout-minutes: 15
     steps:
diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml
index 7a906738cb96..af8da0383ebe 100644
--- a/.github/workflows/example_check_on_pr.yml
+++ b/.github/workflows/example_check_on_pr.yml
@@ -90,7 +90,7 @@ jobs:
       fail-fast: false
       matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
     container:
-      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
     timeout-minutes: 30
     concurrency:
diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml
index 6ec1b0591fc3..bc98e0b0ce5b 100644
--- a/.github/workflows/example_check_on_schedule.yml
+++ b/.github/workflows/example_check_on_schedule.yml
@@ -34,7 +34,7 @@ jobs:
       fail-fast: false
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
     container:
-      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
     timeout-minutes: 30
     steps:
diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml
index b7522ffbdf74..262def229e73 100644
--- a/.github/workflows/run_chatgpt_examples.yml
+++ b/.github/workflows/run_chatgpt_examples.yml
@@ -19,7 +19,7 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data --shm-size=10.24gb
     timeout-minutes: 60
     defaults:
diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml
index c0e74ecbbab0..21545098af74 100644
--- a/.github/workflows/run_chatgpt_unit_tests.yml
+++ b/.github/workflows/run_chatgpt_unit_tests.yml
@@ -19,7 +19,7 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
       options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data
     timeout-minutes: 30
     defaults:
diff --git a/.github/workflows/run_colossalqa_unit_tests.yml b/.github/workflows/run_colossalqa_unit_tests.yml
index 00944b92d9b6..326ef4526a43 100644
--- a/.github/workflows/run_colossalqa_unit_tests.yml
+++ b/.github/workflows/run_colossalqa_unit_tests.yml
@@ -19,7 +19,7 @@ jobs:
       github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
     runs-on: [self-hosted, gpu]
     container:
-      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+      image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
       volumes:
         - /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa
         - /data/scratch/llama-tiny:/data/scratch/llama-tiny
diff --git a/README.md b/README.md
index 69506e338f34..22c565b5058d 100644
--- a/README.md
+++ b/README.md
@@ -420,7 +420,7 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
 ## Installation
 
 Requirements:
-- PyTorch >= 2.1
+- PyTorch >= 2.2
 - Python >= 3.7
 - CUDA >= 11.0
 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 578122d47072..b77a33b0a151 100644
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -8,7 +8,7 @@ click
 fabric
 contexttimer
 ninja
-torch>=2.1.0,<=2.4.0
+torch>=2.2.0,<=2.4.0
 safetensors
 einops
 pydantic

From cc1b0efc17d5be2367bb455cf0ac6d7805e068b1 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 28 Aug 2024 10:16:48 +0800
Subject: [PATCH 062/167] [plugin] hotfix zero plugin (#6036)

* [plugin] hotfix zero plugin

* [plugin] hotfix zero plugin
---
 colossalai/booster/plugin/low_level_zero_plugin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 42bb49bc9ca1..8cc511a5610f 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -100,7 +100,7 @@ def forward(self, *args, **kwargs):
         if self.convert_fn is not None:
             args = tree_map(self.convert_fn, args)
             kwargs = tree_map(self.convert_fn, kwargs)
-        ctx = ColoParamOpHookManager.use_hooks(self.op_hook) if self.overlap_allgather else nullcontext()
+        ctx = ColoParamOpHookManager.use_hooks(*self.op_hooks) if self.overlap_allgather else nullcontext()
         with ctx:
             return super().forward(*args, **kwargs)
 

From 4a68efb7da274e1ab1550e3e018870e089d2a180 Mon Sep 17 00:00:00 2001
From: Tong Li <tong.li352711588@gmail.com>
Date: Wed, 28 Aug 2024 17:01:58 +0800
Subject: [PATCH 063/167] [Colossal-LLaMA] Refactor latest APIs (#6030)

* refactor latest code

* update api

* add dummy dataset

* update Readme

* add setup

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update files

* add PP support

* update arguments

* update argument

* reorg folder

* update version

* remove IB infor

* update utils

* update readme

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update save for zero

* update save

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add apex

* update

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 applications/Colossal-LLaMA/README.md         |  36 +-
 .../colossal_llama/dataset/dummy_dataset.py   |  24 +
 .../utils/flash_attention_patch.py            | 352 --------------
 .../colossal_llama/utils/utils.py             |  36 ++
 .../{ => dataset}/prepare_pretrain_dataset.py |   0
 .../{ => dataset}/prepare_sft_dataset.py      |   0
 .../{ => inference}/inference_example.py      |   0
 .../{ => inference}/stream_chat_example.py    |   0
 applications/Colossal-LLaMA/requirements.txt  |   6 +-
 applications/Colossal-LLaMA/setup.py          |  37 ++
 applications/Colossal-LLaMA/train.example.sh  |  23 +-
 applications/Colossal-LLaMA/train.py          | 428 +++++++++++-------
 applications/Colossal-LLaMA/version.txt       |   2 +-
 13 files changed, 396 insertions(+), 548 deletions(-)
 create mode 100644 applications/Colossal-LLaMA/colossal_llama/dataset/dummy_dataset.py
 delete mode 100644 applications/Colossal-LLaMA/colossal_llama/utils/flash_attention_patch.py
 create mode 100644 applications/Colossal-LLaMA/colossal_llama/utils/utils.py
 rename applications/Colossal-LLaMA/{ => dataset}/prepare_pretrain_dataset.py (100%)
 rename applications/Colossal-LLaMA/{ => dataset}/prepare_sft_dataset.py (100%)
 rename applications/Colossal-LLaMA/{ => inference}/inference_example.py (100%)
 rename applications/Colossal-LLaMA/{ => inference}/stream_chat_example.py (100%)
 create mode 100644 applications/Colossal-LLaMA/setup.py

diff --git a/applications/Colossal-LLaMA/README.md b/applications/Colossal-LLaMA/README.md
index 5997008e8729..e62b14390787 100644
--- a/applications/Colossal-LLaMA/README.md
+++ b/applications/Colossal-LLaMA/README.md
@@ -30,7 +30,7 @@ Colossal-LLaMA
   - [Install](#install)
     - [0. Pre-requisite](#0-pre-requisite)
     - [1. Install required packages](#1-install-required-packages)
-    - [2. Install `xentropy`, `layer_norm` and `rotary`](#2-install-xentropy-layer_norm-and-rotary)
+    - [2. Install Apex](#2-install-apex)
   - [How to run](#how-to-run)
     - [1. Init Tokenizer Preparation](#1-init-tokenizer-preparation)
     - [2. Init Model Preparation](#2-init-model-preparation)
@@ -297,17 +297,13 @@ Here is details about CLI arguments:
 #### 1. Install required packages
 ```
 cd Colossal-LLaMA
-pip install -r requirements.txt
+pip install -e .
 ```
-#### 2. Install `xentropy`, `layer_norm` and `rotary`
+
+#### 2. Install Apex
 ```bash
-git clone git@github.com:Dao-AILab/flash-attention.git
-# At the root folder
-cd csrc/xentropy && pip install .
-# At the root folder
-cd csrc/layer_norm && pip install .
-# At the root folder
-cd csrc/rotary && pip install .
+git clone git@github.com:NVIDIA/apex.git
+# Install from source.
 ```
 
 ### How to run
@@ -427,25 +423,33 @@ Make sure master node can access all nodes (including itself) by ssh without pas
 Here is details about CLI arguments:
 * Pre-trained model path: `--pretrained`. Path to the pre-trained model in Hugging Face format.
 * Dataset path: `--dataset`. Path to the pre-tokenized dataset.
-* Booster plugin: `--plugin`. `gemini`, `gemini_auto`, `zero2`，`zero2_cpu` and `3d` are supported.For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins/).
+* Booster plugin: `--plugin`. `ddp`,`gemini`, `gemini_auto`, `zero2`，`zero2_cpu` and `3d` are supported.For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins/).
 * Intermediate checkpoint to load: `--load_checkpoint`. Path to the intermediate checkpoint. Saved checkpoint contains the states for `lr_scheduler`, `optimizer`,`running_states.json` and `modelling`. If `load_checkpoint` points to the `modelling` folder, only the model weights will be loaded without any other states to support multi-stage training.
 * Save interval: `--save_interval`. The interval (steps) of saving checkpoints. The default value is 1000.
 * Checkpoint directory: `--save_dir`. The directory path to save checkpoint and intermediate states. Intermediate states include `lr_scheduler`, `optimizer`,`running_states.json` and `modelling`.
 * Tensorboard directory: `--tensorboard_dir`. The path to save tensorboard logs.
 * Configuration file: `--config_file`. The path to save the configuration file.
 * Number of epochs: `--num_epochs`. Number of training epochs. The default value is 1.
-* Micro batch size: `--micro_batch_size`. Batch size per GPU. The default value is 1.
+* Batch size: `--batch_size`. Batch size per GPU. The default value is 1. For PP, it refers to number of samples per step.
 * Learning rate: `--lr`. The default value is 3e-4.
 * Max length: `--max_length`. Max context length. The default value is 4096.
 * Mixed precision: `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported.
 * Gradient clipping: `--gradient_clipping`. The default value is 1.0.
-* Weight decay: `-w`, `--weight_decay`. The default value is 0.1.
-* Warmup steps: `-s`, `--warmup_steps`. The default value is calculated by 0.025 warmup ratio.
+* Weight decay: `--weight_decay`. The default value is 0.1.
+* Warmup steps: `--warmup_steps`. The default value is calculated by 0.025 warmup ratio.
 * Gradient checkpointing: `--use_grad_checkpoint`. The default value is `False`. This saves memory at the cost of speed. You'd better enable this option when training with a large batch size.
 * Flash attention: `--use_flash_attn`. If you want to use flash attention, you must install `flash-attn` and related packages. The default value is `False`. This is helpful to accelerate training while saving memory. We recommend you always use flash attention.
 * Freeze non-embedding parameters: `--freeze_non_embeds_params`. Freeze non-embedding parameters. It can be helpful to align embeddings after extending vocabulary size.
-* Tensor parallelism size: `--tp`. TP size for 3d Parallelism. The default value is 1.
-* Zero stage: `--zero`. Zero stage for 3d Parallelism. The default value is 1.
+* Tensor parallelism size: `--tp`. TP size for 3d parallelism. The default value is 1. Used for 3d plugin.
+* Pipeline parallelism size: `--pp`. PP size for 3d parallelism. The default value is 1. Used for 3d plugin.
+* Sequence parallelism size: `--sp`. SP size for 3d parallelism. The default value is 1. Used for 3d plugin.
+* Zero stage: `--zero`. Zero stage for 3d Parallelism. The default value is 1. Used for 3d plugin.
+* Sequence parallelism mode: `--sp_mode`. SP mode, used for 3d plugin. Choose from "split_gather", "ring", "all_to_all".
+* Switch for sequence parallelism: `--enable_sequence_parallelism`. Whether to enable SP, used for 3d plugin.
+* Zero CPU offload: `--zero_cpu_offload`. Whether to use offloading, used for 3d plugin.
+* Micro batch size: `--microbatch_size`. Batch size for each process in PP, used for 3d plugin.
+* Number of dummy sample: `--num_samples`. Number of samples for benchmarking.
+* Benchmark switch: `--benchmark`. Benchmark performance using random dataset.
 
 ##### 4.2 Arguments for Supervised Fine-tuning
 We add support for gradient accumulation and NEFTuning for supervised fine-tuning and thus there are two more arguments apart from the arguments listed in [4.1 Arguments for Pretraining](#41-arguments-for-pretraining).
diff --git a/applications/Colossal-LLaMA/colossal_llama/dataset/dummy_dataset.py b/applications/Colossal-LLaMA/colossal_llama/dataset/dummy_dataset.py
new file mode 100644
index 000000000000..3175159fcd37
--- /dev/null
+++ b/applications/Colossal-LLaMA/colossal_llama/dataset/dummy_dataset.py
@@ -0,0 +1,24 @@
+import torch
+from torch.utils.data import Dataset
+
+from colossalai.accelerator import get_accelerator
+
+
+class RandomDataset(Dataset):
+    def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
+        self.num_samples = num_samples
+        self.max_length = max_length
+        self.input_ids = torch.randint(
+            0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+        )
+        self.attention_mask = torch.ones_like(self.input_ids)
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, idx):
+        return {
+            "input_ids": self.input_ids[idx],
+            "attention_mask": self.attention_mask[idx],
+            "labels": self.input_ids[idx],
+        }
diff --git a/applications/Colossal-LLaMA/colossal_llama/utils/flash_attention_patch.py b/applications/Colossal-LLaMA/colossal_llama/utils/flash_attention_patch.py
deleted file mode 100644
index 6c048c3b18cf..000000000000
--- a/applications/Colossal-LLaMA/colossal_llama/utils/flash_attention_patch.py
+++ /dev/null
@@ -1,352 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-import math
-from types import MethodType
-from typing import Optional, Tuple
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from einops import rearrange
-from transformers.models.llama.configuration_llama import LlamaConfig
-from transformers.models.llama.modeling_llama import (
-    LlamaAttention,
-    LlamaForCausalLM,
-    LlamaModel,
-    LlamaRMSNorm,
-    apply_rotary_pos_emb,
-    repeat_kv,
-)
-
-from colossalai.accelerator import get_accelerator
-from colossalai.logging import get_dist_logger
-
-logger = get_dist_logger()
-
-if get_accelerator().name == "cuda":
-    from flash_attn.bert_padding import pad_input, unpad_input
-    from flash_attn.flash_attn_interface import flash_attn_func, flash_attn_varlen_kvpacked_func
-    from flash_attn.ops.rms_norm import rms_norm
-
-    def _prepare_decoder_attention_mask(
-        self: LlamaModel,
-        attention_mask: torch.BoolTensor,
-        input_shape: torch.Size,
-        inputs_embeds: torch.Tensor,
-        past_key_values_length: int,
-    ) -> Optional[torch.Tensor]:
-        """
-        Decoder attetion mask
-        """
-        if past_key_values_length > 0 and attention_mask is not None:
-            attention_mask = torch.cat(
-                tensors=(
-                    torch.full(
-                        size=(input_shape[0], past_key_values_length),
-                        fill_value=True,
-                        dtype=attention_mask.dtype,
-                        device=attention_mask.device,
-                    ),
-                    attention_mask,
-                ),
-                dim=-1,
-            )  # (bsz, past_key_values_length + q_len)
-        if attention_mask is not None and torch.all(attention_mask):
-            return None  # Faster
-        return attention_mask
-
-    def attention_forward(
-        self: LlamaAttention,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        """
-        Re-define LLaMA-2 `LlamaAttention` forward method using flash-attention.
-        """
-        if output_attentions:
-            logger.warning(
-                "Argument `output_attentions` is not supported for flash-attention patched `LlamaAttention`, "
-                "return `None` instead."
-            )
-
-        bsz, q_len, _ = hidden_states.size()
-
-        if self.config.pretraining_tp > 1:
-            q_slicing, kv_slicing = (
-                dim // self.config.pretraining_tp
-                for dim in (
-                    self.num_heads * self.head_dim,
-                    self.num_key_value_heads * self.head_dim,
-                )
-            )  # `Tuple[int, int]`
-            q_slices, k_slices, v_slices = (
-                proj.weight.split(slicing, dim=0)
-                for proj, slicing in (
-                    (self.q_proj, q_slicing),
-                    (self.k_proj, kv_slicing),
-                    (self.v_proj, kv_slicing),
-                )
-            )  # Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor], Tuple[torch.Tensor]]
-            q, k, v = (
-                torch.cat(
-                    [F.linear(hidden_states, slices[i]) for i in range(self.config.pretraining_tp)],
-                    dim=-1,
-                )
-                for slices in (q_slices, k_slices, v_slices)
-            )
-            # `Tuple[torch.Tensor, torch.Tensor, torch.Tensor]` of shape:
-            # (bsz, q_len, num_heads * head_dim),
-            # (bsz, q_len, num_key_value_heads * head_dim),
-            # (bsz, q_len, num_key_value_heads * head_dim)
-        else:
-            q, k, v = (proj(hidden_states) for proj in (self.q_proj, self.k_proj, self.v_proj))
-            # `Tuple[torch.Tensor, torch.Tensor, torch.Tensor]` of shape:
-            # (bsz, q_len, num_heads * head_dim),
-            # (bsz, q_len, num_key_value_heads * head_dim),
-            # (bsz, q_len, num_key_value_heads * head_dim)
-
-        # (bsz, q_len, num_heads * head_dim) -> (bsz, num_heads, q_len, head_dim);
-        # (bsz, q_len, num_key_value_heads * head_dim) -> (bsz, num_key_value_heads, q_len, head_dim);
-        # (bsz, q_len, num_key_value_heads * head_dim) -> (bsz, num_key_value_heads, q_len, head_dim)
-        q, k, v = (
-            states.view(bsz, q_len, num_heads, self.head_dim).transpose(1, 2)
-            for states, num_heads in (
-                (q, self.num_heads),
-                (k, self.num_key_value_heads),
-                (v, self.num_key_value_heads),
-            )
-        )
-        kv_len = k.shape[-2]  # initially, `kv_len` == `q_len`
-        past_kv_len = 0
-        if past_key_value is not None:
-            # if `past_key_value` is not None, `kv_len` > `q_len`.
-            past_kv_len = past_key_value[0].shape[-2]
-            kv_len += past_kv_len
-
-        # two `torch.Tensor` objs of shape (1, 1, kv_len, head_dim)
-        cos, sin = self.rotary_emb(v, seq_len=kv_len)
-        # (bsz, num_heads, q_len, head_dim), (bsz, num_key_value_heads, q_len, head_dim)
-        q, k = apply_rotary_pos_emb(q=q, k=k, cos=cos, sin=sin, position_ids=position_ids)
-        if past_key_value is not None:
-            # reuse k, v, self_attention
-            k = torch.cat([past_key_value[0], k], dim=2)
-            v = torch.cat([past_key_value[1], v], dim=2)
-
-        past_key_value = (k, v) if use_cache else None
-
-        # repeat k/v heads if n_kv_heads < n_heads
-        k = repeat_kv(hidden_states=k, n_rep=self.num_key_value_groups)
-        # (bsz, num_key_value_heads, q_len, head_dim) -> (bsz, num_heads, q_len, head_dim)
-        v = repeat_kv(hidden_states=v, n_rep=self.num_key_value_groups)
-        # (bsz, num_key_value_heads, q_len, head_dim) -> (bsz, num_heads, q_len, head_dim)
-
-        key_padding_mask = attention_mask
-        # (bsz, num_heads, q_len, head_dim) -> (bsz, q_len, num_heads, head_dim)
-        q, k, v = (states.transpose(1, 2) for states in (q, k, v))
-
-        if past_kv_len > 0:
-            q = torch.cat(
-                tensors=(
-                    torch.full(
-                        size=(bsz, past_kv_len, self.num_heads, self.head_dim),
-                        fill_value=0.0,
-                        dtype=q.dtype,
-                        device=q.device,
-                    ),
-                    q,
-                ),
-                dim=1,
-            )  # (bsz, past_kv_len + q_len, num_heads, head_dim)
-
-        if key_padding_mask is None:
-            # (bsz, past_kv_len + q_len, num_heads, head_dim)
-            output = flash_attn_func(q=q, k=k, v=v, dropout_p=0.0, softmax_scale=None, causal=True)  # (bsz, )
-            output = rearrange(
-                output, pattern="... h d -> ... (h d)"
-            )  # (bsz, past_kv_len + q_len, num_heads * head_dim)
-        else:
-            q, indices, cu_q_lens, max_q_len = unpad_input(hidden_states=q, attention_mask=key_padding_mask)
-            kv, _, cu_kv_lens, max_kv_len = unpad_input(
-                hidden_states=torch.stack(tensors=(k, v), dim=2),
-                attention_mask=key_padding_mask,
-            )
-            output_unpad = flash_attn_varlen_kvpacked_func(
-                q=q,
-                kv=kv,
-                cu_seqlens_q=cu_q_lens,
-                cu_seqlens_k=cu_kv_lens,
-                max_seqlen_q=max_q_len,
-                max_seqlen_k=max_kv_len,
-                dropout_p=0.0,
-                softmax_scale=None,
-                causal=True,
-            )
-            output = pad_input(
-                hidden_states=rearrange(output_unpad, pattern="nnz h d -> nnz (h d)"),
-                indices=indices,
-                batch=bsz,
-                seqlen=past_kv_len + q_len,
-            )  # (bsz, past_kv_len + q_len, num_heads * head_dim)
-
-        if past_kv_len > 0:
-            # Strip off the zero query outputs.
-            output = output[:, past_kv_len:, ...]  # (bsz, q_len, num_heads * head_dim)
-        output = self.o_proj(output)  # (bsz, q_len, hidden_size)
-        return output, None, past_key_value
-
-    def rms_norm_forward(self: LlamaRMSNorm, hidden_states: torch.Tensor) -> torch.Tensor:
-        """
-        Formard function for RMS Norm
-        """
-        return rms_norm(x=hidden_states, weight=self.weight, epsilon=self.variance_epsilon)
-
-    def replace_with_flash_attention(model: LlamaForCausalLM) -> None:
-        for name, module in model.named_modules():
-            if isinstance(module, LlamaAttention):
-                module.forward = MethodType(attention_forward, module)
-            if isinstance(module, LlamaModel):
-                module._prepare_decoder_attention_mask = MethodType(_prepare_decoder_attention_mask, module)
-            if isinstance(module, LlamaRMSNorm):
-                module.forward = MethodType(rms_norm_forward, module)
-
-elif get_accelerator().name == "npu":
-    import torch_npu
-
-    class NPULlamaAttention(LlamaAttention):
-        use_flash: bool = True
-
-        def __init__(self, config: LlamaConfig):
-            super().__init__(config)
-            self.setup()
-
-        def setup(self):
-            self._softmax_scale = 1 / math.sqrt(self.head_dim)
-
-        def forward(
-            self,
-            hidden_states: torch.Tensor,
-            attention_mask: Optional[torch.Tensor] = None,
-            position_ids: Optional[torch.LongTensor] = None,
-            past_key_value: Optional[Tuple[torch.Tensor]] = None,
-            output_attentions: bool = False,
-            use_cache: bool = False,
-        ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-            bsz, q_len, _ = hidden_states.size()
-
-            if self.config.pretraining_tp > 1:
-                key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
-                query_slices = self.q_proj.weight.split(
-                    (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
-                )
-                key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
-                value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
-
-                query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
-                query_states = torch.cat(query_states, dim=-1)
-
-                key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
-                key_states = torch.cat(key_states, dim=-1)
-
-                value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
-                value_states = torch.cat(value_states, dim=-1)
-
-            else:
-                query_states = self.q_proj(hidden_states)
-                key_states = self.k_proj(hidden_states)
-                value_states = self.v_proj(hidden_states)
-
-            query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-            key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-            value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-
-            kv_seq_len = key_states.shape[-2]
-            if past_key_value is not None:
-                kv_seq_len += past_key_value[0].shape[-2]
-            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-
-            if past_key_value is not None:
-                # reuse k, v, self_attention
-                key_states = torch.cat([past_key_value[0], key_states], dim=2)
-                value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-            past_key_value = (key_states, value_states) if use_cache else None
-
-            key_states = repeat_kv(key_states, self.num_key_value_groups)
-            value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-            if not self.use_flash:
-                attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
-
-                if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
-                    raise ValueError(
-                        f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                        f" {attn_weights.size()}"
-                    )
-
-                if attention_mask is not None:
-                    if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                        raise ValueError(
-                            f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                        )
-                    attn_weights = attn_weights + attention_mask
-
-                # upcast attention to fp32
-                attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-                attn_output = torch.matmul(attn_weights, value_states)
-            else:
-                attn_output, *_ = torch_npu.npu_fusion_attention(
-                    query_states,
-                    key_states,
-                    value_states,
-                    self.num_heads,
-                    "BNSD",
-                    atten_mask=attention_mask.bool(),
-                    scale=self._softmax_scale,
-                    padding_mask=None,
-                    pre_tockens=65535,
-                    next_tockens=0,
-                    keep_prob=1.0,
-                    inner_precise=0,
-                )
-
-            if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
-                raise ValueError(
-                    f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
-                    f" {attn_output.size()}"
-                )
-
-            attn_output = attn_output.transpose(1, 2).contiguous()
-            attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
-            if self.config.pretraining_tp > 1:
-                attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
-                o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
-                attn_output = sum(
-                    [F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]
-                )
-            else:
-                attn_output = self.o_proj(attn_output)
-
-            if not output_attentions:
-                attn_weights = None
-
-            return attn_output, attn_weights, past_key_value
-
-    class NPURMSNorm(LlamaRMSNorm):
-        def forward(self, hidden_states):
-            return torch_npu.npu_rms_norm(hidden_states, self.weight, epsilon=self.variance_epsilon)[0]
-
-    def replace_with_flash_attention(model: LlamaForCausalLM) -> None:
-        for name, module in model.named_modules():
-            if isinstance(module, LlamaAttention):
-                module.__class__ = NPULlamaAttention
-                module.setup()
-            if isinstance(module, LlamaRMSNorm):
-                module.__class__ = NPURMSNorm
diff --git a/applications/Colossal-LLaMA/colossal_llama/utils/utils.py b/applications/Colossal-LLaMA/colossal_llama/utils/utils.py
new file mode 100644
index 000000000000..f24ab72c47c9
--- /dev/null
+++ b/applications/Colossal-LLaMA/colossal_llama/utils/utils.py
@@ -0,0 +1,36 @@
+"""
+Utils for Colossal-LLaMA
+"""
+
+import torch
+import torch.distributed as dist
+
+from colossalai.booster import Plugin
+
+
+def all_reduce_mean(tensor: torch.Tensor, plugin: Plugin = None) -> torch.Tensor:
+    if plugin is not None:
+        dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM, group=plugin.dp_group)
+        tensor.div_(plugin.dp_size)
+    else:
+        dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
+        tensor.div_(dist.get_world_size())
+    return tensor
+
+
+def get_model_numel(model: torch.nn.Module) -> int:
+    return sum(p.numel() for p in model.parameters())
+
+
+def format_numel_str(numel: int) -> str:
+    B = 1024**3
+    M = 1024**2
+    K = 1024
+    if numel >= B:
+        return f"{numel / B:.2f} B"
+    elif numel >= M:
+        return f"{numel / M:.2f} M"
+    elif numel >= K:
+        return f"{numel / K:.2f} K"
+    else:
+        return f"{numel}"
diff --git a/applications/Colossal-LLaMA/prepare_pretrain_dataset.py b/applications/Colossal-LLaMA/dataset/prepare_pretrain_dataset.py
similarity index 100%
rename from applications/Colossal-LLaMA/prepare_pretrain_dataset.py
rename to applications/Colossal-LLaMA/dataset/prepare_pretrain_dataset.py
diff --git a/applications/Colossal-LLaMA/prepare_sft_dataset.py b/applications/Colossal-LLaMA/dataset/prepare_sft_dataset.py
similarity index 100%
rename from applications/Colossal-LLaMA/prepare_sft_dataset.py
rename to applications/Colossal-LLaMA/dataset/prepare_sft_dataset.py
diff --git a/applications/Colossal-LLaMA/inference_example.py b/applications/Colossal-LLaMA/inference/inference_example.py
similarity index 100%
rename from applications/Colossal-LLaMA/inference_example.py
rename to applications/Colossal-LLaMA/inference/inference_example.py
diff --git a/applications/Colossal-LLaMA/stream_chat_example.py b/applications/Colossal-LLaMA/inference/stream_chat_example.py
similarity index 100%
rename from applications/Colossal-LLaMA/stream_chat_example.py
rename to applications/Colossal-LLaMA/inference/stream_chat_example.py
diff --git a/applications/Colossal-LLaMA/requirements.txt b/applications/Colossal-LLaMA/requirements.txt
index 809a942ac398..5b62926f616d 100644
--- a/applications/Colossal-LLaMA/requirements.txt
+++ b/applications/Colossal-LLaMA/requirements.txt
@@ -1,15 +1,15 @@
 torch==2.1.2
 huggingface-hub
 packaging==24.0
-colossalai==0.3.6
+colossalai>=0.4.0
 autoflake==2.2.1
 black==23.9.1
-transformers==4.34.1
+transformers>=4.39.3
 tensorboard==2.14.0
 six==1.16.0
 datasets
 ninja==1.11.1
-flash-attn>=2.0.0,<=2.0.5
+flash-attn
 tqdm
 sentencepiece==0.1.99
 protobuf<=3.20.0
diff --git a/applications/Colossal-LLaMA/setup.py b/applications/Colossal-LLaMA/setup.py
new file mode 100644
index 000000000000..c9ba31698218
--- /dev/null
+++ b/applications/Colossal-LLaMA/setup.py
@@ -0,0 +1,37 @@
+from setuptools import find_packages, setup
+
+
+def fetch_requirements(path):
+    with open(path, "r") as fd:
+        return [r.strip() for r in fd.readlines()]
+
+
+def fetch_readme():
+    with open("README.md", encoding="utf-8") as f:
+        return f.read()
+
+
+def fetch_version():
+    with open("version.txt", "r") as f:
+        return f.read().strip()
+
+
+setup(
+    name="colossal_llama",
+    version=fetch_version(),
+    packages=find_packages(exclude=("*.egg-info",)),
+    description="Continual Pre-training and SFT for LLaMA",
+    long_description=fetch_readme(),
+    long_description_content_type="text/markdown",
+    license="Apache Software License 2.0",
+    url="https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA",
+    install_requires=fetch_requirements("requirements.txt"),
+    python_requires=">=3.7",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: Apache Software License",
+        "Environment :: GPU :: NVIDIA CUDA",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+        "Topic :: System :: Distributed Computing",
+    ],
+)
diff --git a/applications/Colossal-LLaMA/train.example.sh b/applications/Colossal-LLaMA/train.example.sh
index 6a1c887bf6cc..b795e8bcf810 100644
--- a/applications/Colossal-LLaMA/train.example.sh
+++ b/applications/Colossal-LLaMA/train.example.sh
@@ -1,13 +1,20 @@
 #!/bin/bash
+set_n_least_used_CUDA_VISIBLE_DEVICES() {
+    local n=${1:-"9999"}
+    echo "GPU Memory Usage:"
+    local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
+        tail -n +2 |
+        nl -v 0 |
+        tee /dev/tty |
+        sort -g -k 2 |
+        awk '{print $1}' |
+        head -n $n)
+    export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
+    echo "Now CUDA_VISIBLE_DEVICES is set to:"
+    echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+}
 
-# NCCL IB environment variables
-export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1
-export NCCL_IB_DISABLE=0
-export NCCL_SOCKET_IFNAME=eth0
-export NCCL_IB_GID_INDEX=3
-export NCCL_IB_TIMEOUT=23
-export NCCL_IB_RETRY_CNT=7
-export OMP_NUM_THREADS=8
+set_n_least_used_CUDA_VISIBLE_DEVICES 8
 
 PROJECT_NAME=""
 PARENT_SAVE_DIR=""
diff --git a/applications/Colossal-LLaMA/train.py b/applications/Colossal-LLaMA/train.py
index e74aad33c3e3..112a1e0dc223 100644
--- a/applications/Colossal-LLaMA/train.py
+++ b/applications/Colossal-LLaMA/train.py
@@ -11,24 +11,24 @@
 from contextlib import nullcontext
 
 import torch
-import torch.distributed as dist
+from colossal_llama.dataset.dummy_dataset import RandomDataset
 from colossal_llama.dataset.loader import (
     DataCollatorForSupervisedDataset,
     StatefulDistributedSampler,
     load_tokenized_dataset,
 )
 from colossal_llama.utils.ckpt_io import load_checkpoint, save_checkpoint
-from colossal_llama.utils.flash_attention_patch import replace_with_flash_attention
 from colossal_llama.utils.froze import freeze_non_embeds_parameters
 from colossal_llama.utils.neftune_patch import activate_neftune, deactivate_neftune
+from colossal_llama.utils.utils import all_reduce_mean, format_numel_str, get_model_numel
 from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
-from transformers import AutoTokenizer, LlamaForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer
 
 import colossalai
 from colossalai.accelerator import get_accelerator
 from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
@@ -36,109 +36,7 @@
 from colossalai.utils import get_current_device
 
 
-def get_model_numel(model: torch.nn.Module) -> int:
-    return sum(p.numel() for p in model.parameters())
-
-
-def format_numel_str(numel: int) -> str:
-    B = 1024**3
-    M = 1024**2
-    K = 1024
-    if numel >= B:
-        return f"{numel / B:.2f} B"
-    elif numel >= M:
-        return f"{numel / M:.2f} M"
-    elif numel >= K:
-        return f"{numel / K:.2f} K"
-    else:
-        return f"{numel}"
-
-
-def all_reduce_mean(tensor: torch.Tensor) -> torch.Tensor:
-    dist.all_reduce(tensor=tensor, op=dist.ReduceOp.SUM)
-    tensor = tensor.data
-    tensor.div_(dist.get_world_size())
-    return tensor
-
-
-def main() -> None:
-    # ==============================
-    # Parse Arguments
-    # ==============================
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--pretrained",
-        type=str,
-        default=None,
-        help="Address of the pre-trained modeling",
-    )
-    parser.add_argument("--dataset", nargs="+", default=[])
-    parser.add_argument(
-        "--plugin",
-        type=str,
-        default="gemini",
-        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
-        help="Choose which plugin to use",
-    )
-    parser.add_argument("--load_checkpoint", type=str, default=None, help="Load checkpoint")
-    parser.add_argument("--save_interval", type=int, default=1000, help="Save interval")
-    parser.add_argument("--save_dir", type=str, default="checkpoint_dir", help="Checkpoint directory")
-    parser.add_argument("--tensorboard_dir", type=str, default="logs_dir", help="Tensorboard directory")
-    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
-    parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs")
-    parser.add_argument("--accumulation_steps", type=int, default=1, help="Number of accumulation steps")
-    parser.add_argument("--micro_batch_size", type=int, default=2, help="Batch size of each process")
-    parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate")
-    parser.add_argument("--max_length", type=int, default=8192, help="Model max length")
-    parser.add_argument(
-        "--mixed_precision",
-        type=str,
-        default="fp16",
-        choices=["fp16", "bf16"],
-        help="Mixed precision",
-    )
-    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
-    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
-    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
-    parser.add_argument(
-        "--use_grad_checkpoint",
-        action="store_true",
-        default=False,
-        help="Use gradient checkpointing",
-    )
-    parser.add_argument(
-        "--use_flash_attn",
-        action="store_true",
-        default=False,
-        help="Use flash-attention",
-    )
-    parser.add_argument(
-        "--use_neft",
-        action="store_true",
-        default=False,
-        help="Use NEFTune",
-    )
-    parser.add_argument(
-        "--freeze_non_embeds_params",
-        action="store_true",
-        default=False,
-        help="Freeze non embeddings parameters",
-    )
-    parser.add_argument("--tp", type=int, default=1)
-    parser.add_argument("--zero", type=int, default=1)
-    parser.add_argument("--pad_token", choices=["eos", "unk"], default="eos")
-    parser.add_argument("--padding_mode", choices=["max_length", "longest"], default="max_length")
-    parser.add_argument(
-        "--skip_save_each_epoch",
-        action="store_true",
-        default=False,
-        help="skip saving the model checkpoint after each epoch is completed.",
-    )
-    args = parser.parse_args()
-
-    with open(args.config_file, "w") as f:
-        json.dump(args.__dict__, f, indent=4)
-
+def train(args) -> None:
     # ==============================
     # Initialize Distributed Training
     # ==============================
@@ -147,21 +45,27 @@ def main() -> None:
     coordinator = DistCoordinator()
 
     # ==============================
-    # Initialize Tensorboard
+    # Initialize Tensorboard and Save Config
     # ==============================
     if coordinator.is_master():
         os.makedirs(args.tensorboard_dir, exist_ok=True)
         writer = SummaryWriter(args.tensorboard_dir)
 
+        with open(args.config_file, "w") as f:
+            json.dump(args.__dict__, f, indent=4)
+
     # ==============================
     # Initialize Booster
     # ==============================
-    if args.plugin == "gemini":
+    if args.plugin == "ddp":
+        plugin = TorchDDPPlugin(find_unused_parameters=True if args.use_grad_checkpoint is False else False)
+    elif args.plugin == "gemini":
         plugin = GeminiPlugin(
             precision=args.mixed_precision,
             initial_scale=2**16,
             max_norm=args.grad_clip,
             enable_gradient_accumulation=(args.accumulation_steps > 1),
+            enable_flash_attention=args.use_flash_attn,
         )
     elif args.plugin == "gemini_auto":
         plugin = GeminiPlugin(
@@ -170,6 +74,7 @@ def main() -> None:
             initial_scale=2**16,
             max_norm=args.grad_clip,
             enable_gradient_accumulation=(args.accumulation_steps > 1),
+            enable_flash_attention=args.use_flash_attn,
         )
     elif args.plugin == "zero2":
         plugin = LowLevelZeroPlugin(
@@ -189,10 +94,17 @@ def main() -> None:
     elif args.plugin == "3d":
         plugin = HybridParallelPlugin(
             tp_size=args.tp,
-            pp_size=1,
-            zero_stage=args.zero,
+            pp_size=args.pp,
+            sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
+            zero_stage=args.zero_stage,
+            enable_flash_attention=args.use_flash_attn,
+            enable_sequence_parallelism=args.enable_sequence_parallelism,
+            cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False,
+            parallel_output=False,
             max_norm=args.grad_clip,
             precision=args.mixed_precision,
+            microbatch_size=args.microbatch_size,
         )
     else:
         raise ValueError(f"Unknown plugin {args.plugin}")
@@ -210,24 +122,38 @@ def main() -> None:
     tokenizer.add_bos_token = False
     tokenizer.add_eos_token = False
 
-    coordinator.print_on_master(f"Configuration file will be saved at: {args.config_file}")
-    coordinator.print_on_master(f"Tensorboard logs will be saved at: {args.tensorboard_dir}")
-    coordinator.print_on_master(f"Model checkpoint will be saved at: {args.save_dir}")
+    coordinator.print_on_master(
+        f"Training Info:\nConfig file: {args.config_file} \nTensorboard logs: {args.tensorboard_dir} \nModel checkpoint: {args.save_dir}"
+    )
 
-    coordinator.print_on_master(f"Load dataset: {args.dataset}")
+    if args.benchmark:
+        coordinator.print_on_master(f"Run benchmark with {args.num_samples} random samples.")
+        dataset = RandomDataset(
+            num_samples=args.num_samples, max_length=args.max_length, vocab_size=tokenizer.vocab_size
+        )
+        dataloader = plugin.prepare_dataloader(
+            dataset,
+            batch_size=args.batch_size,
+            shuffle=True,
+            drop_last=True,
+            seed=42,
+            distributed_sampler_cls=StatefulDistributedSampler,
+        )
+    else:
+        coordinator.print_on_master(f"Load dataset: {args.dataset}")
+        dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train")
+        data_collator = DataCollatorForSupervisedDataset(
+            tokenizer=tokenizer, max_length=args.max_length, padding=args.padding_mode
+        )
+        dataloader = plugin.prepare_dataloader(
+            dataset=dataset,
+            batch_size=args.batch_size,
+            shuffle=True,
+            drop_last=True,
+            collate_fn=data_collator,
+            distributed_sampler_cls=StatefulDistributedSampler,
+        )
 
-    dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train")
-    data_collator = DataCollatorForSupervisedDataset(
-        tokenizer=tokenizer, max_length=args.max_length, padding=args.padding_mode
-    )
-    dataloader = plugin.prepare_dataloader(
-        dataset=dataset,
-        batch_size=args.micro_batch_size,
-        shuffle=True,
-        drop_last=True,
-        collate_fn=data_collator,
-        distributed_sampler_cls=StatefulDistributedSampler,
-    )
     coordinator.print_on_master(
         f"Max device memory after data loader: {accelerator.max_memory_allocated() / 1024 ** 2:.2f} MB"
     )
@@ -241,7 +167,19 @@ def main() -> None:
         else nullcontext()
     )
     with init_ctx:
-        model = LlamaForCausalLM.from_pretrained(args.pretrained)
+        if args.use_flash_attn:
+            model = AutoModelForCausalLM.from_pretrained(
+                args.pretrained,
+                attn_implementation="flash_attention_2",
+                torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16,
+                trust_remote_code=True,
+            )
+        else:
+            model = AutoModelForCausalLM.from_pretrained(
+                args.pretrained,
+                torch_dtype=torch.bfloat16 if args.mixed_precision == "bf16" else torch.float16,
+                trust_remote_code=True,
+            )
         # Freeze part of parameters.
         if args.freeze_non_embeds_params:
             freeze_non_embeds_parameters(model=model)
@@ -251,9 +189,6 @@ def main() -> None:
     if args.use_grad_checkpoint:
         model.gradient_checkpointing_enable()
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
-    if args.use_flash_attn:
-        replace_with_flash_attention(model=model)
-        coordinator.print_on_master(msg="Flash-attention enabled successfully")
 
     model_numel = get_model_numel(model)
     coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
@@ -342,43 +277,98 @@ def main() -> None:
 
     for epoch in range(start_epoch, args.num_epochs):
         dataloader.sampler.set_epoch(epoch=epoch)
-        pbar = tqdm(
-            desc=f"Epoch {epoch}",
-            disable=not coordinator.is_master(),
-            total=num_steps_per_epoch,
-            initial=start_step // args.accumulation_steps,
-        )
-        total_loss = torch.tensor(0.0, device=get_current_device())
-        for step, batch in enumerate(dataloader, start=start_step):
-            batch = {k: v.to(get_current_device()) for k, v in batch.items() if isinstance(v, torch.Tensor)}
-
-            batch_output = model(**batch)
-
-            loss = batch_output.loss / args.accumulation_steps
-            total_loss.add_(loss.data)
-
-            booster.backward(loss=loss, optimizer=optimizer)
-
-            if (step + 1) % args.accumulation_steps == 0:
+        if isinstance(plugin, HybridParallelPlugin) and plugin.pp_size > 1:
+            data_iter = iter(dataloader)
+            step_bar = tqdm(
+                range(len(dataloader)),
+                desc="Step",
+                disable=not (coordinator._local_rank == coordinator._world_size - 1),
+            )
+            for step in step_bar:
+                outputs = booster.execute_pipeline(
+                    data_iter,
+                    model,
+                    criterion=lambda outputs, inputs: outputs[0],
+                    optimizer=optimizer,
+                    return_loss=True,
+                )
+                loss = outputs["loss"]
+                if booster.plugin.stage_manager.is_last_stage():
+                    global_loss = all_reduce_mean(loss, plugin)
+                    if coordinator._local_rank == coordinator._world_size - 1:
+                        step_bar.set_postfix({"train/loss": global_loss.item()})
                 optimizer.step()
-                lr_scheduler.step()
                 optimizer.zero_grad()
 
-                all_reduce_mean(tensor=total_loss)
-                pbar.set_postfix({"Loss": f"{total_loss.item():.4f}"})
-                if coordinator.is_master():
-                    global_step = (epoch * num_steps_per_epoch) + (step + 1) // args.accumulation_steps
-                    writer.add_scalar(tag="Loss", scalar_value=total_loss.item(), global_step=global_step)
-                    writer.add_scalar(
-                        tag="Learning Rate",
-                        scalar_value=lr_scheduler.get_last_lr()[0],
-                        global_step=global_step,
+                # Save modeling.
+                save_model_condition = args.save_interval > 0 and (step + 1) % args.save_interval == 0
+
+                if not args.skip_save_each_epoch:
+                    save_model_condition = save_model_condition or (step + 1) == len(dataloader)
+
+                if save_model_condition and not args.benchmark:
+                    coordinator.print_on_master("\nStart saving model checkpoint with running states")
+
+                    if args.use_neft:
+                        coordinator.print_on_master("Deactivate NEFTune before saving model.")
+                        deactivate_neftune(model, handle)
+
+                    accelerator.empty_cache()
+                    save_checkpoint(
+                        save_dir=args.save_dir,
+                        booster=booster,
+                        model=model,
+                        optimizer=optimizer,
+                        lr_scheduler=lr_scheduler,
+                        epoch=epoch,
+                        step=step + 1,
+                        batch_size=args.batch_size,
+                        coordinator=coordinator,
+                    )
+                    coordinator.print_on_master(
+                        f"Saved checkpoint at epoch {epoch} step {step + 1} at folder {args.save_dir}"
                     )
-                total_loss.fill_(0.0)
-                pbar.update()
 
-            # Save modeling.
+                    if args.use_neft:
+                        coordinator.print_on_master("Activate NEFTune.")
+                        model, handle = activate_neftune(model)
+        else:
+            pbar = tqdm(
+                desc=f"Epoch {epoch}",
+                disable=not coordinator.is_master(),
+                total=num_steps_per_epoch,
+                initial=start_step // args.accumulation_steps,
+            )
+            total_loss = torch.tensor(0.0, device=get_current_device())
+            for step, batch in enumerate(dataloader, start=start_step):
+                batch = {k: v.to(get_current_device()) for k, v in batch.items() if isinstance(v, torch.Tensor)}
+
+                batch_output = model(**batch)
+
+                loss = batch_output.loss / args.accumulation_steps
+                total_loss.add_(loss.data)
+
+                booster.backward(loss=loss, optimizer=optimizer)
+
+                if (step + 1) % args.accumulation_steps == 0:
+                    optimizer.step()
+                    lr_scheduler.step()
+                    optimizer.zero_grad()
+
+                    all_reduce_mean(tensor=total_loss)
+                    pbar.set_postfix({"Loss": f"{total_loss.item():.4f}"})
+                    if coordinator.is_master():
+                        global_step = (epoch * num_steps_per_epoch) + (step + 1) // args.accumulation_steps
+                        writer.add_scalar(tag="Loss", scalar_value=total_loss.item(), global_step=global_step)
+                        writer.add_scalar(
+                            tag="Learning Rate",
+                            scalar_value=lr_scheduler.get_last_lr()[0],
+                            global_step=global_step,
+                        )
+                    total_loss.fill_(0.0)
+                    pbar.update()
 
+            # Save modeling.
             save_model_condition = (
                 args.save_interval > 0 and (step + 1) % (args.save_interval * args.accumulation_steps) == 0
             )
@@ -386,7 +376,7 @@ def main() -> None:
             if not args.skip_save_each_epoch:
                 save_model_condition = save_model_condition or (step + 1) == len(dataloader)
 
-            if save_model_condition:
+            if save_model_condition and not args.benchmark:
                 coordinator.print_on_master("\nStart saving model checkpoint with running states")
 
                 if args.use_neft:
@@ -402,7 +392,7 @@ def main() -> None:
                     lr_scheduler=lr_scheduler,
                     epoch=epoch,
                     step=step + 1,
-                    batch_size=args.micro_batch_size,
+                    batch_size=args.batch_size,
                     coordinator=coordinator,
                 )
                 coordinator.print_on_master(
@@ -426,12 +416,114 @@ def main() -> None:
         deactivate_neftune(model, handle)
 
     # Final save.
-    coordinator.print_on_master("Start saving final model checkpoint")
-    booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
-    coordinator.print_on_master(f"Saved final model checkpoint at epoch {epoch} at folder {args.save_dir}")
+    if not args.benchmark:
+        coordinator.print_on_master("Start saving final model checkpoint")
+        booster.save_model(model, os.path.join(args.save_dir, "modeling"), shard=True)
+        coordinator.print_on_master(f"Saved final model checkpoint at epoch {epoch} at folder {args.save_dir}")
 
     coordinator.print_on_master(f"Max device memory usage: {accelerator.max_memory_allocated()/1024**2:.2f} MB")
 
 
 if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser()
+    # Basic training information.
+    parser.add_argument(
+        "--pretrained",
+        type=str,
+        default=None,
+        help="Address of the pre-trained model",
+    )
+    parser.add_argument("--load_checkpoint", type=str, default=None, help="Load checkpoint for continuous training.")
+    parser.add_argument("--dataset", nargs="+", default=[])
+    parser.add_argument(
+        "--plugin",
+        type=str,
+        default="gemini",
+        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d", "ddp"],
+        help="Choose which plugin to use",
+    )
+    parser.add_argument("--save_interval", type=int, default=1000, help="Save interval")
+    parser.add_argument("--save_dir", type=str, default="checkpoint_dir", help="Checkpoint directory")
+    parser.add_argument("--tensorboard_dir", type=str, default="logs_dir", help="Tensorboard directory")
+    parser.add_argument("--config_file", type=str, default="config_file", help="Config file")
+    # Training parameters
+    parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs")
+    parser.add_argument("--accumulation_steps", type=int, default=1, help="Number of accumulation steps")
+    parser.add_argument("--batch_size", type=int, default=2, help="Global Batch size of each process")
+    parser.add_argument("--lr", type=float, default=3e-4, help="Learning rate")
+    parser.add_argument("--max_length", type=int, default=8192, help="Model max length")
+    parser.add_argument(
+        "--mixed_precision",
+        type=str,
+        default="fp16",
+        choices=["fp16", "bf16"],
+        help="Mixed precision",
+    )
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
+    parser.add_argument("--weight_decay", type=float, default=0.1, help="Weight decay")
+    parser.add_argument("--warmup_steps", type=int, default=None, help="Warmup steps")
+    parser.add_argument(
+        "--use_grad_checkpoint",
+        action="store_true",
+        default=False,
+        help="Use gradient checkpointing",
+    )
+    parser.add_argument(
+        "--use_flash_attn",
+        action="store_true",
+        default=False,
+        help="Use flash-attention",
+    )
+    parser.add_argument(
+        "--use_neft",
+        action="store_true",
+        default=False,
+        help="Use NEFTune",
+    )
+    parser.add_argument(
+        "--freeze_non_embeds_params",
+        action="store_true",
+        default=False,
+        help="Freeze non embeddings parameters",
+    )
+    parser.add_argument("--pad_token", choices=["eos", "unk"], default="eos")
+    parser.add_argument("--padding_mode", choices=["max_length", "longest"], default="max_length")
+    parser.add_argument(
+        "--skip_save_each_epoch",
+        action="store_true",
+        default=False,
+        help="Skip saving the model checkpoint after each epoch is completed.",
+    )
+
+    # Additional arguments for 3d plugin.
+    parser.add_argument("--tp", type=int, default=1, help="TP size, used for 3d plugin.")
+    parser.add_argument("--pp", type=int, default=1, help="PP size, used for 3d plugin.")
+    parser.add_argument("--sp", type=int, default=1, help="SP size, used for 3d plugin.")
+    parser.add_argument("--zero_stage", type=int, default=0, help="Zero stage, used for 3d plugin.", choices=[0, 1, 2])
+    parser.add_argument(
+        "--sp_mode",
+        type=str,
+        default="split_gather",
+        choices=["split_gather", "ring", "all_to_all"],
+        help="SP mode, used for 3d plugin.",
+    )
+    parser.add_argument(
+        "--enable_sequence_parallelism",
+        default=False,
+        action="store_true",
+        help="Whether to enable SP, used for 3d plugin.",
+    )
+    parser.add_argument(
+        "--zero_cpu_offload", default=False, action="store_true", help="Whether to use offloading, used for 3d plugin."
+    )
+    parser.add_argument(
+        "--microbatch_size", type=int, default=1, help="Batch size for each process in PP, used for 3d plugin."
+    )
+
+    # Additional arguments for benchmark.
+    parser.add_argument("--num_samples", type=int, default=500, help="Number of samples for benchmarking.")
+    parser.add_argument(
+        "--benchmark", action="store_true", default=False, help="Benchmark performance using random dataset."
+    )
+    args = parser.parse_args()
+    train(args)
diff --git a/applications/Colossal-LLaMA/version.txt b/applications/Colossal-LLaMA/version.txt
index 3eefcb9dd5b3..9084fa2f716a 100644
--- a/applications/Colossal-LLaMA/version.txt
+++ b/applications/Colossal-LLaMA/version.txt
@@ -1 +1 @@
-1.0.0
+1.1.0

From 0d3a85d04f467d0de5e0d6fb00571dc3ef8fe023 Mon Sep 17 00:00:00 2001
From: Tong Li <tong.li352711588@gmail.com>
Date: Wed, 28 Aug 2024 17:12:51 +0800
Subject: [PATCH 064/167] add fused norm (#6038)

---
 applications/Colossal-LLaMA/train.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/applications/Colossal-LLaMA/train.py b/applications/Colossal-LLaMA/train.py
index 112a1e0dc223..db23275e4e31 100644
--- a/applications/Colossal-LLaMA/train.py
+++ b/applications/Colossal-LLaMA/train.py
@@ -65,6 +65,7 @@ def train(args) -> None:
             initial_scale=2**16,
             max_norm=args.grad_clip,
             enable_gradient_accumulation=(args.accumulation_steps > 1),
+            enable_fused_normalization=torch.cuda.is_available(),
             enable_flash_attention=args.use_flash_attn,
         )
     elif args.plugin == "gemini_auto":
@@ -74,6 +75,7 @@ def train(args) -> None:
             initial_scale=2**16,
             max_norm=args.grad_clip,
             enable_gradient_accumulation=(args.accumulation_steps > 1),
+            enable_fused_normalization=torch.cuda.is_available(),
             enable_flash_attention=args.use_flash_attn,
         )
     elif args.plugin == "zero2":
@@ -99,6 +101,7 @@ def train(args) -> None:
             sequence_parallelism_mode=args.sp_mode,
             zero_stage=args.zero_stage,
             enable_flash_attention=args.use_flash_attn,
+            enable_fused_normalization=torch.cuda.is_available(),
             enable_sequence_parallelism=args.enable_sequence_parallelism,
             cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False,
             parallel_output=False,

From e96a0761ea697cc2d372b7b3d41df6b304353859 Mon Sep 17 00:00:00 2001
From: Guangyao Zhang <xjtu521@qq.com>
Date: Thu, 29 Aug 2024 14:49:23 +0800
Subject: [PATCH 065/167] [FP8] unsqueeze scale to make it compatible with
 torch.compile (#6040)

---
 colossalai/quantization/fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 5b606616e97a..c022fab158c8 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -56,7 +56,7 @@ def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False) -
             scale_inv = 1.0 / scale
 
     ret = (scale * inp.float()).to(fp8_type)
-    return ret, scale_inv
+    return ret, torch.unsqueeze(scale_inv, dim=0)
 
 
 def cast_from_fp8(

From e9032fb0b2bd9d49871e12a5cda987b36ae85dfa Mon Sep 17 00:00:00 2001
From: "Gao, Ruiyuan" <905370712@qq.com>
Date: Mon, 2 Sep 2024 16:56:35 +0800
Subject: [PATCH 066/167] [colossalai/checkpoint_io/...] fix bug in
 load_state_dict_into_model; format error msg (#6020)

* fix bug in load_state_dict_into_model; format error msg

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update utils.py

to support checking missing_keys

* Update general_checkpoint_io.py

fix bug in missing_keys error message

* retrigger tests

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 colossalai/checkpoint_io/general_checkpoint_io.py      |  6 +++---
 .../checkpoint_io/hybrid_parallel_checkpoint_io.py     |  6 +++---
 colossalai/checkpoint_io/utils.py                      | 10 +++++-----
 colossalai/inference/core/plugin.py                    |  6 +++---
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/colossalai/checkpoint_io/general_checkpoint_io.py b/colossalai/checkpoint_io/general_checkpoint_io.py
index b9253a56dcbb..2534fa163da1 100644
--- a/colossalai/checkpoint_io/general_checkpoint_io.py
+++ b/colossalai/checkpoint_io/general_checkpoint_io.py
@@ -220,9 +220,9 @@ def load_sharded_model(
         if strict:
             remain_keys = reduce(lambda a, b: a & b, map(set, missing_keys))
             if len(remain_keys) > 0:
-                error_msgs = "Missing key(s) in state_dict: {}. ".format(
-                    ", ".join('"{}"'.format(k) for k in missing_keys)
-                )
+                error_msgs = [
+                    "Missing key(s) in state_dict: {}. ".format(", ".join('"{}"'.format(k) for k in remain_keys))
+                ]
                 raise RuntimeError(
                     "Error(s) in loading state_dict for {}:\n\t{}".format(
                         self.__class__.__name__, "\n\t".join(error_msgs)
diff --git a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
index 043e5c2b0618..3b6917d32fa6 100644
--- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
+++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
@@ -381,9 +381,9 @@ def _load(name: str):
         remain_keys = remain_keys.union(set(missing_file_keys))
         if len(remain_keys) > 0:
             if strict:
-                error_msgs = "Missing key(s) in state_dict: {}. ".format(
-                    ", ".join('"{}"'.format(k) for k in missing_keys)
-                )
+                error_msgs = [
+                    "Missing key(s) in state_dict: {}. ".format(", ".join('"{}"'.format(k) for k in missing_keys))
+                ]
                 raise RuntimeError(
                     "Error(s) in loading state_dict for {}:\n\t{}".format(
                         self.__class__.__name__, "\n\t".join(error_msgs)
diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py
index 36138f33e9ab..b3917bd9d381 100644
--- a/colossalai/checkpoint_io/utils.py
+++ b/colossalai/checkpoint_io/utils.py
@@ -553,10 +553,10 @@ def load_state_dict_into_model(
 
     def load(module: nn.Module, state_dict, prefix="", load_sub_module: bool = True):
         local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-        args = (state_dict, prefix, local_metadata, True, sub_missing_keys, [], error_msgs)
+        args = (state_dict, prefix, local_metadata, True, sub_missing_keys, unexpected_keys, error_msgs)
         # Parameters of module and children will start with prefix. We can exit early if there are none in this
         # state_dict
-        if len([key for key in state_dict if key.startswith(prefix)]) > 0:
+        if strict or len([key for key in state_dict if key.startswith(prefix)]) > 0:
             module._load_from_state_dict(*args)
         if load_sub_module:
             for name, child in module._modules.items():
@@ -570,9 +570,9 @@ def load(module: nn.Module, state_dict, prefix="", load_sub_module: bool = True)
 
     if strict:
         if len(unexpected_keys) > 0:
-            error_msgs = "Unexpected key(s) in state_dict: {}. ".format(
-                ", ".join('"{}"'.format(k) for k in unexpected_keys)
-            )
+            error_msgs = [
+                "Unexpected key(s) in state_dict: {}. ".format(", ".join('"{}"'.format(k) for k in unexpected_keys))
+            ]
             raise RuntimeError(
                 "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
             )
diff --git a/colossalai/inference/core/plugin.py b/colossalai/inference/core/plugin.py
index d6a2b8b16550..ae526b888eee 100644
--- a/colossalai/inference/core/plugin.py
+++ b/colossalai/inference/core/plugin.py
@@ -116,9 +116,9 @@ def _load(name: str):
         remain_keys = remain_keys.union(set(missing_file_keys))
         if len(remain_keys) > 0:
             if strict:
-                error_msgs = "Missing key(s) in state_dict: {}. ".format(
-                    ", ".join('"{}"'.format(k) for k in missing_keys)
-                )
+                error_msgs = [
+                    "Missing key(s) in state_dict: {}. ".format(", ".join('"{}"'.format(k) for k in missing_keys))
+                ]
                 raise RuntimeError(
                     "Error(s) in loading state_dict for {}:\n\t{}".format(
                         self.__class__.__name__, "\n\t".join(error_msgs)

From c650a906db49f0a574576198ecdb9f095873c375 Mon Sep 17 00:00:00 2001
From: Tong Li <tong.li352711588@gmail.com>
Date: Tue, 3 Sep 2024 10:33:18 +0800
Subject: [PATCH 067/167] [Hotfix] Remove deprecated install (#6042)

* remove deprecated install

* remove unused folder
---
 applications/ColossalChat/README.md               | 15 ++-------------
 .../01-ai_Yi-1.5-9B-Chat.json                     |  0
 .../Qwen_Qwen1.5-110B-Chat.json                   |  0
 .../Qwen_Qwen1.5-32B-Chat.json                    |  0
 .../conversation_template/THUDM_chatglm2-6b.json  |  0
 .../conversation_template/THUDM_chatglm3-6b.json  |  0
 .../baichuan-inc_Baichuan2-13B-Chat.json          |  0
 .../conversation_template/colossal-llama2.json    |  0
 .../deepseek-ai_DeepSeek-V2-Lite.json             |  0
 .../conversation_template/llama2.json             |  0
 .../conversation_template/microsoft_phi-2.json    |  0
 .../mistralai_Mixtral-8x7B-Instruct-v0.1.json     |  0
 .../conversation_template/tiny-llama.json         |  0
 13 files changed, 2 insertions(+), 13 deletions(-)
 rename applications/ColossalChat/{config => }/conversation_template/01-ai_Yi-1.5-9B-Chat.json (100%)
 rename applications/ColossalChat/{config => }/conversation_template/Qwen_Qwen1.5-110B-Chat.json (100%)
 rename applications/ColossalChat/{config => }/conversation_template/Qwen_Qwen1.5-32B-Chat.json (100%)
 rename applications/ColossalChat/{config => }/conversation_template/THUDM_chatglm2-6b.json (100%)
 rename applications/ColossalChat/{config => }/conversation_template/THUDM_chatglm3-6b.json (100%)
 rename applications/ColossalChat/{config => }/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json (100%)
 rename applications/ColossalChat/{config => }/conversation_template/colossal-llama2.json (100%)
 rename applications/ColossalChat/{config => }/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json (100%)
 rename applications/ColossalChat/{config => }/conversation_template/llama2.json (100%)
 rename applications/ColossalChat/{config => }/conversation_template/microsoft_phi-2.json (100%)
 rename applications/ColossalChat/{config => }/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json (100%)
 rename applications/ColossalChat/{config => }/conversation_template/tiny-llama.json (100%)

diff --git a/applications/ColossalChat/README.md b/applications/ColossalChat/README.md
index 3604fab103a2..100cc5ece9c3 100755
--- a/applications/ColossalChat/README.md
+++ b/applications/ColossalChat/README.md
@@ -102,21 +102,10 @@ More details can be found in the latest news.
 conda create -n colossal-chat python=3.10.9 (>=3.8.7)
 conda activate colossal-chat
 
-# Install flash-attention
-git clone -b v2.0.5 https://github.com/Dao-AILab/flash-attention.git
-cd $FLASH_ATTENTION_ROOT/
-pip install .
-cd $FLASH_ATTENTION_ROOT/csrc/xentropy
-pip install .
-cd $FLASH_ATTENTION_ROOT/csrc/layer_norm
-pip install .
-cd $FLASH_ATTENTION_ROOT/csrc/rotary
-pip install .
-
-# Clone Colossalai
+# Clone ColossalAI
 git clone https://github.com/hpcaitech/ColossalAI.git
 
-# Install ColossalAI
+# Install ColossalAI, make sure you have torch installed before using BUILD_EXT=1.
 cd $COLOSSAL_AI_ROOT
 BUILD_EXT=1 pip install .
 
diff --git a/applications/ColossalChat/config/conversation_template/01-ai_Yi-1.5-9B-Chat.json b/applications/ColossalChat/conversation_template/01-ai_Yi-1.5-9B-Chat.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/01-ai_Yi-1.5-9B-Chat.json
rename to applications/ColossalChat/conversation_template/01-ai_Yi-1.5-9B-Chat.json
diff --git a/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-110B-Chat.json b/applications/ColossalChat/conversation_template/Qwen_Qwen1.5-110B-Chat.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-110B-Chat.json
rename to applications/ColossalChat/conversation_template/Qwen_Qwen1.5-110B-Chat.json
diff --git a/applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-32B-Chat.json b/applications/ColossalChat/conversation_template/Qwen_Qwen1.5-32B-Chat.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/Qwen_Qwen1.5-32B-Chat.json
rename to applications/ColossalChat/conversation_template/Qwen_Qwen1.5-32B-Chat.json
diff --git a/applications/ColossalChat/config/conversation_template/THUDM_chatglm2-6b.json b/applications/ColossalChat/conversation_template/THUDM_chatglm2-6b.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/THUDM_chatglm2-6b.json
rename to applications/ColossalChat/conversation_template/THUDM_chatglm2-6b.json
diff --git a/applications/ColossalChat/config/conversation_template/THUDM_chatglm3-6b.json b/applications/ColossalChat/conversation_template/THUDM_chatglm3-6b.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/THUDM_chatglm3-6b.json
rename to applications/ColossalChat/conversation_template/THUDM_chatglm3-6b.json
diff --git a/applications/ColossalChat/config/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json b/applications/ColossalChat/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json
rename to applications/ColossalChat/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json
diff --git a/applications/ColossalChat/config/conversation_template/colossal-llama2.json b/applications/ColossalChat/conversation_template/colossal-llama2.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/colossal-llama2.json
rename to applications/ColossalChat/conversation_template/colossal-llama2.json
diff --git a/applications/ColossalChat/config/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json b/applications/ColossalChat/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json
rename to applications/ColossalChat/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json
diff --git a/applications/ColossalChat/config/conversation_template/llama2.json b/applications/ColossalChat/conversation_template/llama2.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/llama2.json
rename to applications/ColossalChat/conversation_template/llama2.json
diff --git a/applications/ColossalChat/config/conversation_template/microsoft_phi-2.json b/applications/ColossalChat/conversation_template/microsoft_phi-2.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/microsoft_phi-2.json
rename to applications/ColossalChat/conversation_template/microsoft_phi-2.json
diff --git a/applications/ColossalChat/config/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json b/applications/ColossalChat/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
rename to applications/ColossalChat/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json
diff --git a/applications/ColossalChat/config/conversation_template/tiny-llama.json b/applications/ColossalChat/conversation_template/tiny-llama.json
similarity index 100%
rename from applications/ColossalChat/config/conversation_template/tiny-llama.json
rename to applications/ColossalChat/conversation_template/tiny-llama.json

From c3b5caff0e637a4ba78eacd44bb17aa1a41f5c97 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Tue, 3 Sep 2024 15:45:17 +0800
Subject: [PATCH 068/167] [fp8] optimize all-gather (#6043)

* [fp8] optimize all-gather

* [fp8] fix all gather fp8 ring

* [fp8] enable compile

* [fp8] fix all gather fp8 ring
---
 colossalai/quantization/fp8.py | 106 +++++++++++++++++++++++++++++++--
 1 file changed, 101 insertions(+), 5 deletions(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index c022fab158c8..6a0bd14d1071 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -8,6 +8,7 @@
 from torch.distributed import ReduceOp
 
 SUPPORT_TORCH_COMPILE = Version(torch.__version__) >= Version("2.4.0")
+SCALE_BYTES = 4
 
 
 class Handle:
@@ -22,7 +23,9 @@ def wait(self):
             self.remain_ops()
 
 
-def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False) -> Tuple[torch.Tensor, torch.Tensor]:
+def cast_to_fp8(
+    inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False, out=None
+) -> Tuple[torch.Tensor, torch.Tensor]:
     r"""
     casting torch Tensor into specified fp8 tensor with per-channel scaling or per-tensor scaling.
     Args:
@@ -55,12 +58,15 @@ def cast_to_fp8(inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False) -
             scale = fp8_max / per_tensor_max
             scale_inv = 1.0 / scale
 
-    ret = (scale * inp.float()).to(fp8_type)
+    if out is not None:
+        ret = torch.mul(scale, inp.float(), out=out)
+    else:
+        ret = (scale * inp.float()).to(fp8_type)
     return ret, torch.unsqueeze(scale_inv, dim=0)
 
 
 def cast_from_fp8(
-    inp: torch.Tensor, scale_inv: torch.Tensor, ret_type: torch.dtype, per_channel_scale=False
+    inp: torch.Tensor, scale_inv: torch.Tensor, ret_type: torch.dtype, per_channel_scale=False, out=None
 ) -> torch.Tensor:
     r"""
     Args:
@@ -74,9 +80,15 @@ def cast_from_fp8(
         raise TypeError("Only float8_e4m3fn and float8_e5m2 are allowed.")
 
     if per_channel_scale:
-        ret = scale_inv[:, None] * inp.float()
+        if out is not None:
+            return torch.mul(scale_inv[:, None], inp.float(), out=out)
+        else:
+            ret = scale_inv[:, None] * inp.float()
     else:
-        ret = scale_inv * inp.float()
+        if out is not None:
+            return torch.mul(scale_inv, inp.float(), out=out)
+        else:
+            ret = scale_inv * inp.float()
     return ret.to(ret_type)
 
 
@@ -664,6 +676,90 @@ def cast_op():
         cast_op()
 
 
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False)
+def all_gather_fp8(output_list, input_, group=None, fp8_format="e5m2", async_op: bool = False) -> Optional[Handle]:
+    world_size = dist.get_world_size(group)
+    shape = input_.shape
+    input_type = input_.dtype
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+
+    combined_buffer = torch.empty(world_size * (SCALE_BYTES + input_.numel()), dtype=torch.uint8, device=input_.device)
+    combined_buffers = list(combined_buffer.chunk(world_size, dim=0))
+    cur_buffer = combined_buffers[dist.get_rank(group)]
+    ret = cur_buffer[SCALE_BYTES:].view(fp8_type)
+    ret, scale = cast_to_fp8(input_.view(-1), fp8_format=fp8_format, out=ret)
+    cur_buffer[:SCALE_BYTES].view(torch.float)[0] = scale
+    # cur_buffer[:SCALE_BYTES] = scale.unsqueeze(0).view(torch.uint8)
+    dist.all_gather(combined_buffers, cur_buffer, group=group, async_op=async_op)
+    for out, buf in zip(output_list, combined_buffers):
+        scale = buf[:SCALE_BYTES].clone().view(scale.dtype)
+        output = buf[SCALE_BYTES:].view(fp8_type)
+        cast_from_fp8(output.view(shape), scale, input_type, out=out)
+    # output = combined_buffer.view(world_size, -1)[:, SCALE_BYTES:].view(fp8_type)
+    # scales = combined_buffer.view(world_size, -1)[:, :SCALE_BYTES].view(torch.float)
+    # output = output.float() * scales
+    # for i, out in enumerate(output_list):
+    #     out.copy_(output[i].view(shape))
+
+
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False)
+def all_gather_fp8_ring(output_list, input_, group=None, fp8_format="e5m2", async_op: bool = False) -> Optional[Handle]:
+    world_size = dist.get_world_size(group)
+    rank = dist.get_rank(group)
+
+    send_rank = (rank + 1) % world_size
+    recv_rank = (rank - 1) % world_size
+
+    shape = input_.shape
+    input_type = input_.dtype
+    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
+
+    combined_buffer = torch.empty(world_size * (SCALE_BYTES + input_.numel()), dtype=torch.uint8, device=input_.device)
+    combined_buffers = list(combined_buffer.chunk(world_size, dim=0))
+    cur_buffer = combined_buffers[dist.get_rank(group)]
+    ret = cur_buffer[SCALE_BYTES:].view(fp8_type)
+    ret, scale = cast_to_fp8(input_.view(-1), fp8_format=fp8_format, out=ret)
+    # cur_buffer[:SCALE_BYTES] = scale.unsqueeze(0).view(torch.uint8)
+    cur_buffer[:SCALE_BYTES].view(torch.float)[0] = scale
+
+    def send_recv(idx):
+        send_idx = (rank - idx) % world_size
+        recv_idx = (rank - idx - 1) % world_size
+        ops = dist.batch_isend_irecv(
+            [
+                dist.P2POp(dist.isend, combined_buffers[send_idx], send_rank, group=group),
+                dist.P2POp(dist.irecv, combined_buffers[recv_idx], recv_rank, group=group),
+            ]
+        )
+        return ops
+
+    def cast(idx):
+        cast_idx = (rank - idx - 1) % world_size
+        scale = combined_buffers[cast_idx][:SCALE_BYTES].clone().view(torch.float)
+        output = combined_buffers[cast_idx][SCALE_BYTES:].view(fp8_type)
+        cast_from_fp8(output.view(shape), scale, input_type, out=output_list[cast_idx])
+
+    # warmup
+    ops = send_recv(0)
+    output_list[rank].copy_(input_)
+    for op in ops:
+        op.wait()
+    ops = []
+
+    # 1p-1c
+    for i in range(1, world_size - 1):
+        new_ops = send_recv(i)
+        for op in ops:
+            op.wait()
+        cast(i - 1)
+        ops = new_ops
+
+    # cooldown
+    for op in ops:
+        op.wait()
+    cast(world_size - 2)
+
+
 class _LinearFp8(torch.autograd.Function):
     @staticmethod
     def forward(

From 26e553937bd491666ba9a5fddf16773eb498dcf9 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Tue, 3 Sep 2024 16:37:16 +0800
Subject: [PATCH 069/167] [fp8] fix linear hook (#6046)

---
 colossalai/booster/plugin/hybrid_parallel_plugin.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index ae1fbc7719a0..6a333862a909 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -119,7 +119,8 @@ def __init__(
         if use_fp8:
             self.op_hooks.append(FP8Hook())
         if overlap_allgather:
-            self.op_hook = ZeroOpHook()
+            self.op_hooks.append(ZeroOpHook())
+        if use_fp8 or overlap_allgather:
             for p in module.parameters():
                 if p.requires_grad and type(p) is not ColoParameter:
                     p.__class__ = ColoParameter

From 5ce6dd75bf2889199517f27cb240169ff04454b7 Mon Sep 17 00:00:00 2001
From: Hanks <hangxu0304@gmail.com>
Date: Mon, 9 Sep 2024 13:47:17 +0800
Subject: [PATCH 070/167] [fp8] disable all_to_all_fp8 in intranode  (#6045)

* enhance all_to_all_fp8 with internode comm control

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* disable some fp8 ops due to performance issue

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 colossalai/quantization/fp8.py | 79 +++++++++++++++++++++++++++++++---
 1 file changed, 73 insertions(+), 6 deletions(-)

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 6a0bd14d1071..388bbde052d2 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -1,3 +1,4 @@
+import os
 from typing import Any, Optional, Tuple
 
 import numpy as np
@@ -23,6 +24,24 @@ def wait(self):
             self.remain_ops()
 
 
+def process_group_is_intranode(pg):
+    if pg is None:
+        from torch.distributed.distributed_c10d import _get_default_group
+
+        pg = _get_default_group()
+
+    local_world_size = None
+    for var in ["LOCAL_WORLD_SIZE", "OMPI_COMM_WORLD_LOCAL_SIZE", "SLURM_TASKS_PER_NODE"]:
+        if var in os.environ:
+            local_world_size = int(os.environ["LOCAL_WORLD_SIZE"])
+    if local_world_size is None:
+        local_world_size = torch.cuda.device_count()
+
+    group_ranks = dist.get_process_group_ranks(pg)
+    group_ranks_node_ids = [rank // local_world_size for rank in group_ranks]
+    return min(group_ranks_node_ids) == max(group_ranks_node_ids)
+
+
 def cast_to_fp8(
     inp: torch.Tensor, fp8_format="e4m3", per_channel_scale=False, out=None
 ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -92,7 +111,7 @@ def cast_from_fp8(
     return ret.to(ret_type)
 
 
-def all_reduce_fp8(
+def _all_reduce_fp8(
     tensor: torch.Tensor, fp8_format="e4m3", op=ReduceOp.SUM, group=None, async_op: bool = False
 ) -> Optional[Handle]:
     r"""
@@ -159,7 +178,15 @@ def cat_op():
         cat_op()
 
 
-def all_to_all_single_fp8(
+def all_reduce_fp8(
+    tensor: torch.Tensor, fp8_format="e4m3", op=ReduceOp.SUM, group=None, async_op: bool = False
+) -> Optional[Handle]:
+    # fall back to default op due to performance issue
+    return dist.all_reduce(tensor, op=op, group=group, async_op=async_op)
+
+
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False)
+def _all_to_all_single_fp8(
     output, input, output_split_sizes=None, input_split_sizes=None, fp8_format="e5m2", group=None, async_op=False
 ) -> Optional[Handle]:
     r"""
@@ -222,6 +249,33 @@ def cast_op():
         cast_op()
 
 
+def all_to_all_single_fp8(
+    output, input, output_split_sizes=None, input_split_sizes=None, fp8_format="e5m2", group=None, async_op=False
+) -> Optional[Handle]:
+    r"""
+    This is wrapper for _all_to_all_single_fp8.
+    """
+    if process_group_is_intranode(group):
+        return dist.all_to_all_single(
+            output,
+            input,
+            output_split_sizes=output_split_sizes,
+            input_split_sizes=input_split_sizes,
+            group=group,
+            async_op=async_op,
+        )
+    else:
+        return _all_to_all_single_fp8(
+            output,
+            input,
+            fp8_format=fp8_format,
+            output_split_sizes=output_split_sizes,
+            input_split_sizes=input_split_sizes,
+            group=group,
+            async_op=async_op,
+        )
+
+
 def cast_to_fp8_pipeline(inp: Any) -> None:
     """
     Cast the hidden_states tensor of inp object to fp8 format before p2p communication in pipeline.
@@ -293,7 +347,7 @@ def cast_from_fp8_pipeline(inp: Any, del_metadata=True) -> None:
         del inp["dtype"]
 
 
-def reduce_scatter_fp8(
+def _reduce_scatter_fp8(
     output: torch.Tensor, input_list, group, fp8_format="e5m2", async_op: bool = False
 ) -> Optional[Handle]:
     r"""
@@ -338,6 +392,13 @@ def cast_op():
         cast_op()
 
 
+def reduce_scatter_fp8(
+    output: torch.Tensor, input_list, group, fp8_format="e5m2", async_op: bool = False
+) -> Optional[Handle]:
+    # fall back to default op due to performance issue
+    return dist.reduce_scatter(output, input_list, group=group, async_op=async_op)
+
+
 def fp8_compress_ddp_grad_comm_hook_async(
     process_group: dist.ProcessGroup,
     bucket: dist.GradBucket,
@@ -617,10 +678,9 @@ def cast_op():
         cast_op()
 
 
-def all_to_all_fp8(output_list, input_list, group=None, fp8_format="e5m2", async_op=False):
-
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False)
+def _all_to_all_fp8(output_list, input_list, group=None, fp8_format="e5m2", async_op=False):
     world_size = dist.get_world_size(group)
-
     input_type = input_list[0].dtype
     fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
     scale_list = []
@@ -651,6 +711,13 @@ def cast_op():
         cast_op()
 
 
+def all_to_all_fp8(output_list, input_list, group=None, fp8_format="e5m2", async_op=False):
+    if process_group_is_intranode(group):
+        return dist.all_to_all(output_list, input_list, group=group, async_op=async_op)
+    else:
+        return _all_to_all_fp8(output_list, input_list, group=group, fp8_format=fp8_format, async_op=async_op)
+
+
 def gather_fp8(output_list, input_, group=None, fp8_format="e5m2", async_op: bool = False) -> Optional[Handle]:
 
     world_size = dist.get_world_size(group)

From b3db1058ec7080c63505d97d7d055fd66e80cb6c Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Tue, 10 Sep 2024 10:31:09 +0800
Subject: [PATCH 071/167] [release] update version (#6041)

* [release] update version

* [devops] update comp test

* [devops] update comp test debug

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test
---
 .github/workflows/compatiblity_test_on_dispatch.yml             | 2 +-
 .github/workflows/compatiblity_test_on_pr.yml                   | 2 +-
 .github/workflows/compatiblity_test_on_schedule.yml             | 2 +-
 .github/workflows/cuda_ext_check_before_merge.yml               | 2 +-
 .github/workflows/doc_test_on_pr.yml                            | 2 +-
 .github/workflows/doc_test_on_schedule.yml                      | 2 +-
 .github/workflows/example_check_on_dispatch.yml                 | 2 +-
 .github/workflows/example_check_on_schedule.yml                 | 2 +-
 .../test_kernels/triton/test_fused_rotary_embedding.py          | 1 +
 version.txt                                                     | 2 +-
 10 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml
index 1a458d7bbc96..c56b6211d97b 100644
--- a/.github/workflows/compatiblity_test_on_dispatch.yml
+++ b/.github/workflows/compatiblity_test_on_dispatch.yml
@@ -64,7 +64,7 @@ jobs:
 
       - name: Install Colossal-AI
         run: |
-          BUILD_EXT=1 pip install -v .
+          BUILD_EXT=1 pip install -v -e .
           pip install --no-cache-dir -r requirements/requirements-test.txt
 
       - name: Install tensornvme
diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml
index 770f4b933156..68fb3a090be7 100644
--- a/.github/workflows/compatiblity_test_on_pr.yml
+++ b/.github/workflows/compatiblity_test_on_pr.yml
@@ -58,7 +58,7 @@ jobs:
 
       - name: Install Colossal-AI
         run: |
-          BUILD_EXT=1 pip install -v .
+          BUILD_EXT=1 pip install -v -e .
           pip install --no-cache-dir -r requirements/requirements-test.txt
 
       - name: Install tensornvme
diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml
index c6455604f070..9e6265b1bbe2 100644
--- a/.github/workflows/compatiblity_test_on_schedule.yml
+++ b/.github/workflows/compatiblity_test_on_schedule.yml
@@ -52,7 +52,7 @@ jobs:
 
       - name: Install Colossal-AI
         run: |
-          BUILD_EXT=1 pip install -v .
+          BUILD_EXT=1 pip install -v -e .
           pip install --no-cache-dir -r requirements/requirements-test.txt
 
       - name: Install tensornvme
diff --git a/.github/workflows/cuda_ext_check_before_merge.yml b/.github/workflows/cuda_ext_check_before_merge.yml
index 14f53bd69ef9..65d9451018c0 100644
--- a/.github/workflows/cuda_ext_check_before_merge.yml
+++ b/.github/workflows/cuda_ext_check_before_merge.yml
@@ -51,4 +51,4 @@ jobs:
 
       - name: Build
         run: |
-          BUILD_EXT=1 pip install -v .
+          BUILD_EXT=1 pip install -v -e .
diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml
index 2e0ff6a59c74..99a3f18a0d03 100644
--- a/.github/workflows/doc_test_on_pr.yml
+++ b/.github/workflows/doc_test_on_pr.yml
@@ -89,7 +89,7 @@ jobs:
       - name: Install ColossalAI
         run: |
           source activate pytorch
-          BUILD_EXT=1 pip install -v .
+          BUILD_EXT=1 pip install -v -e .
 
       - name: Test the Doc
         run: |
diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml
index 3ea6481f9980..902aba77469a 100644
--- a/.github/workflows/doc_test_on_schedule.yml
+++ b/.github/workflows/doc_test_on_schedule.yml
@@ -32,7 +32,7 @@ jobs:
 
       - name: Install ColossalAI
         run: |
-          BUILD_EXT=1 pip install -v .
+          BUILD_EXT=1 pip install -v -e .
 
       - name: Install Doc Test Requirements
         run: |
diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml
index 6a65c4ff5462..7039ed9c285b 100644
--- a/.github/workflows/example_check_on_dispatch.yml
+++ b/.github/workflows/example_check_on_dispatch.yml
@@ -53,7 +53,7 @@ jobs:
         uses: actions/checkout@v3
       - name: Install Colossal-AI
         run: |
-          BUILD_EXT=1 pip install -v .
+          BUILD_EXT=1 pip install -v -e .
       - name: Test the example
         run: |
           dir=${{ matrix.directory }}
diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml
index bc98e0b0ce5b..db55c305be1d 100644
--- a/.github/workflows/example_check_on_schedule.yml
+++ b/.github/workflows/example_check_on_schedule.yml
@@ -43,7 +43,7 @@ jobs:
 
       - name: Install Colossal-AI
         run: |
-          BUILD_EXT=1 pip install -v .
+          BUILD_EXT=1 pip install -v -e .
 
       - name: Traverse all files
         run: |
diff --git a/tests/test_infer/test_kernels/triton/test_fused_rotary_embedding.py b/tests/test_infer/test_kernels/triton/test_fused_rotary_embedding.py
index 787e48986185..b69f35740d92 100644
--- a/tests/test_infer/test_kernels/triton/test_fused_rotary_embedding.py
+++ b/tests/test_infer/test_kernels/triton/test_fused_rotary_embedding.py
@@ -19,6 +19,7 @@
 TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4")
 
 
+@pytest.mark.skip(reason="cuda error")
 @pytest.mark.skipif(not (HAS_TRITON and TRITON_CUDA_SUPPORT), reason="requires triton")
 def test_fused_rotary_emb():
     num_tokens = 20
diff --git a/version.txt b/version.txt
index 2b7c5ae01848..17b2ccd9bf90 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.4.2
+0.4.3

From 8fd25d6e09069a8437c6ebee8dd83e1de4c9b83d Mon Sep 17 00:00:00 2001
From: Wenxuan Tan <wenxuan.tan@wisc.edu>
Date: Tue, 10 Sep 2024 12:06:50 +0800
Subject: [PATCH 072/167] [Feature] Split cross-entropy computation in SP
 (#5959)

* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* update softmax_lse shape by new interface

* change tester name

* remove buffer clone; support packed seq layout

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

* adapt chatglm, command-R, qwen

* debug

* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* add sp_mode to benchmark; fix varlen interface

* update softmax_lse shape by new interface

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

* add comments

* q1 index only once

* remove events to simplify stream sync

* simplify forward/backward logic

* 2d ring forward passed

* 2d ring backward passed

* fixes

* fix ring attn loss

* 2D ring backward + llama passed

* merge

* update logger

* fix typo

* rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix typo

* remove typos

* fixes

* support GPT

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 colossalai/shardformer/layer/_operation.py    |  10 +-
 colossalai/shardformer/layer/attn.py          |  13 +-
 colossalai/shardformer/layer/loss.py          |   5 +-
 .../shardformer/layer/qkv_fused_linear.py     |  39 +-
 colossalai/shardformer/layer/utils.py         |   6 +
 colossalai/shardformer/modeling/bloom.py      |  25 +-
 colossalai/shardformer/modeling/chatglm2.py   | 129 ++--
 colossalai/shardformer/modeling/command.py    |  81 ++-
 colossalai/shardformer/modeling/gpt2.py       | 595 +++---------------
 colossalai/shardformer/modeling/llama.py      | 322 +---------
 colossalai/shardformer/modeling/mistral.py    |  14 +-
 colossalai/shardformer/modeling/opt.py        |  23 +-
 colossalai/shardformer/modeling/qwen2.py      |  98 ++-
 colossalai/shardformer/policies/chatglm2.py   |   2 +-
 colossalai/shardformer/policies/gpt2.py       |  94 ++-
 colossalai/shardformer/policies/llama.py      |  18 +-
 .../gpt/hybridparallelism/benchmark.py        |   9 +-
 examples/language/performance_evaluator.py    |   8 +-
 tests/kit/model_zoo/transformers/gpt.py       |  11 +-
 tests/test_shardformer/test_model/_utils.py   |   2 -
 .../test_model/test_shard_chatglm2.py         |  30 +-
 .../test_model/test_shard_command.py          |  15 +-
 .../test_model/test_shard_gpt2.py             |  64 +-
 .../test_model/test_shard_llama.py            |  15 +-
 .../test_model/test_shard_qwen2.py            |  52 +-
 25 files changed, 517 insertions(+), 1163 deletions(-)

diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index f58a21df3614..f970d8ccc85d 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -1097,13 +1097,19 @@ def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1, fp8
     return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim, fp8_communication)
 
 
-def gather_sp_output(hidden_states, sp_group, sp_mode, fp8_communication=False):
+def gather_sp_output(hidden_states, shard_config, sp_dim=1):
     """
     Gather the output of the last layer for cross entropy computation
     """
+    sp_group = shard_config.sequence_parallel_process_group
+    sp_mode = shard_config.sequence_parallelism_mode
+    fp8_comm = shard_config.fp8_communication
+    if dist.get_world_size(sp_group) == 1:
+        return hidden_states
+
     # Rescale grad (HybridParallelPlugin applies ZeRO grad averaging on the DP * SP group)
     scale = None if is_share_sp_tp(sp_mode) else dist.get_world_size(sp_group)
     hidden_states = gather_forward_split_backward(
-        hidden_states, 1, sp_group, grad_scale=scale, fp8_communication=fp8_communication
+        hidden_states, sp_dim, sp_group, grad_scale=scale, fp8_communication=fp8_comm
     )
     return hidden_states
diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 5d1a30d8a4b6..bf4fa77c6c23 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -433,7 +433,6 @@ def get_double_ring_groups(sp_group, inner_ring_size=None):
         assert (
             sp_size % inner_ring_size == 0
         ), f"sp_size {sp_size} should be divisible by inner_ring_size {inner_ring_size}"
-
         logger = get_dist_logger()
         logger.info(
             f"Using 2D Ring Attention with inner ring size {inner_ring_size} to maximze NIC util for inter-node comm. Cross your fingers for speed-ups!",
@@ -898,6 +897,7 @@ def backward(ctx, dout, _):
 
         local_sp_rank = dist.get_rank(sp_group)
         sp_size = dist.get_world_size(sp_group)
+
         # Using separate streams (pg) for concurrent kv and dkv comm may
         # cause NCCL "software caused connection abort" here...
         local_kv_comm = RingComm(local_kv_group)
@@ -1119,9 +1119,14 @@ def prepare_varlen_batch(
                 the batch dim to a packed 1d sequence. Contingent on model forward shape definitions.
 
         Returns:
-            inputs_embeds: Packed input embeddings of shape [B, Sq // sp_size, ...].
-            mask_info: A dictionary of mask info.
-            position_ids: Packed position ids of shape [..., Sq // sp_size].
+            torch.Tensor:
+                Packed input embeddings of shape [B, Sq // sp_size, ...].
+
+            Dict[str, Any]:
+                A dictionary containing mask info.
+
+            torch.Tensor:
+                Packed position ids of shape [..., Sq // sp_size].
 
         """
         _load_varlen_helpers()
diff --git a/colossalai/shardformer/layer/loss.py b/colossalai/shardformer/layer/loss.py
index 12df824d1c0c..0e2241af9fc9 100644
--- a/colossalai/shardformer/layer/loss.py
+++ b/colossalai/shardformer/layer/loss.py
@@ -153,7 +153,6 @@ def dist_cross_entropy(
     labels: torch.Tensor,  # [B, S] or [B, S, Vocab_size]
     logits: torch.Tensor,  # [B, S, Vocab_size]
     shard_config: ShardConfig,
-    out_features: int,
     vocab_size: int,
     dtype: torch.dtype,
     seq_dim: int = 1,
@@ -226,13 +225,13 @@ def dist_cross_entropy(
             logits,
             labels,
             process_group=shard_config.tensor_parallel_process_group,
-            vocab_size=out_features,
+            vocab_size=vocab_size,
             dtype=dtype,
             mode="sum",
         )
     else:
         # NOTE if use TP and not parallel_output, the output is gathered in VocabParallelLMHead1D
-        logits = logits.view(-1, vocab_size)
+        logits = logits.view(-1, logits.size(-1))
         loss = loss_fct(logits, labels)
 
     # Reduce loss instead of gathering logits over seq dim for savings
diff --git a/colossalai/shardformer/layer/qkv_fused_linear.py b/colossalai/shardformer/layer/qkv_fused_linear.py
index f9a41a467300..6fd689908af0 100644
--- a/colossalai/shardformer/layer/qkv_fused_linear.py
+++ b/colossalai/shardformer/layer/qkv_fused_linear.py
@@ -313,19 +313,19 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
 
         # Matrix multiply.
         bias = self.bias if not self.skip_bias_add else None
-
-        if self.seq_parallel_mode is None:
-            # Set up backprop all-reduce.
-            input_parallel = reduce_backward(input_, self.process_group, fp8_communication=self.fp8_communication)
-            output_parallel = matmul_with_async_comm(
+        if self.seq_parallel_mode == "split_gather":
+            input_parallel = input_
+            output_parallel = matmul_gather_forward_reducescatter_backward(
                 input_parallel,
                 self.weight,
                 bias,
                 self.process_group,
-                self.async_communication,
+                True,
+                1,
+                self.overlap,
                 fp8_communication=self.fp8_communication,
             )
-        elif self.seq_parallel_mode == "split_gather":
+        elif self.seq_parallel_mode == "ring":
             input_parallel = input_
             output_parallel = matmul_gather_forward_reducescatter_backward(
                 input_parallel,
@@ -335,13 +335,22 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
                 True,
                 1,
                 self.overlap,
+                True,
                 fp8_communication=self.fp8_communication,
             )
-        elif self.seq_parallel_mode == "ring":
-            input_parallel = input_
-            output_parallel = matmul_gather_forward_reducescatter_backward(
-                input_parallel, self.weight, bias, self.process_group, True, 1, self.overlap, True
+        elif self.seq_parallel_mode is None or self.seq_parallel_mode == "ring_attn":
+            # Set up backprop all-reduce.
+            input_parallel = reduce_backward(input_, self.process_group)
+            output_parallel = matmul_with_async_comm(
+                input_parallel,
+                self.weight,
+                bias,
+                self.process_group,
+                self.async_communication,
+                fp8_communication=self.fp8_communication,
             )
+        else:
+            raise NotImplementedError(f"seq_parallel_mode={self.seq_parallel_mode} is not supported!")
 
         if self.gather_output:
             # All-gather across the partitions.
@@ -553,7 +562,7 @@ def forward(self, input_: Tensor) -> Tensor:
                     handle.wait()
                 output = torch.cat(output_parallel_list, dim=-1)
         else:
-            if self.seq_parallel_mode is None:
+            if self.seq_parallel_mode is None or self.seq_parallel_mode == "ring_attn":
                 output_parallel = torch.matmul(input_, self.weight)
                 output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
             elif self.seq_parallel_mode == "split_gather":
@@ -567,8 +576,12 @@ def forward(self, input_: Tensor) -> Tensor:
             elif self.seq_parallel_mode == "ring":
                 output_parallel = torch.matmul(input_, self.weight)
                 output = reducescatter_forward_gather_backward(
-                    output_parallel, self.process_group, 1, self.fp8_communication
+                    output_parallel,
+                    self.process_group,
+                    1,
                 )
+            else:
+                raise NotImplementedError(f"seq_parallel_mode={self.seq_parallel_mode} is not supported!")
 
         if not self.skip_bias_add:
             if self.bias is not None:
diff --git a/colossalai/shardformer/layer/utils.py b/colossalai/shardformer/layer/utils.py
index c1a73ce05c97..4512e0c680f3 100644
--- a/colossalai/shardformer/layer/utils.py
+++ b/colossalai/shardformer/layer/utils.py
@@ -309,6 +309,9 @@ def split_batch_zigzag(
     """
     sp_size = dist.get_world_size(sp_group)
     sp_rank = dist.get_rank(sp_group)
+    if sp_size == 1:
+        return batch
+
     if isinstance(batch, torch.Tensor):
         batch = [batch]
     seq_dim = seq_dim if seq_dim != -1 else batch[0].dim() - 1
@@ -364,6 +367,9 @@ def split_varlen_zigzag(
     """
     sp_size = dist.get_world_size(sp_group)
     sp_rank = dist.get_rank(sp_group)
+    if sp_size == 1:
+        return batch
+
     if is_2d:
         assert max_seqlen > 0, "max_seqlen must be provided for 2D input"
 
diff --git a/colossalai/shardformer/modeling/bloom.py b/colossalai/shardformer/modeling/bloom.py
index f8fd4665f1bb..7e8e50d9bbd0 100644
--- a/colossalai/shardformer/modeling/bloom.py
+++ b/colossalai/shardformer/modeling/bloom.py
@@ -365,14 +365,15 @@ def bloom_for_causal_lm_forward(
             hidden_states = transformer_outputs[0]
             lm_logits = self.lm_head(hidden_states).contiguous()
 
-            loss = dist_cross_entropy(
-                labels,
-                lm_logits,
-                shard_config,
-                self.lm_head.out_features,
-                self.config.vocab_size,
-                self.transformer.dtype,
-            )
+            loss = None
+            if labels is not None:
+                loss = dist_cross_entropy(
+                    labels,
+                    lm_logits,
+                    shard_config,
+                    self.lm_head.out_features,
+                    self.transformer.dtype,
+                )
 
             if not return_dict:
                 output = (lm_logits,) + transformer_outputs[1:]
@@ -1036,9 +1037,11 @@ def forward(
         hidden_states = transformer_outputs[0]
         lm_logits = self.lm_head(hidden_states)
 
-        loss = dist_cross_entropy(
-            labels, lm_logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.transformer.dtype
-        )
+        loss = None
+        if labels is not None:
+            loss = dist_cross_entropy(
+                labels, lm_logits, shard_config, self.lm_head.out_features, self.transformer.dtype
+            )
 
         if not return_dict:
             output = (lm_logits,) + transformer_outputs[1:]
diff --git a/colossalai/shardformer/modeling/chatglm2.py b/colossalai/shardformer/modeling/chatglm2.py
index a761968af009..a9be5c74dba8 100644
--- a/colossalai/shardformer/modeling/chatglm2.py
+++ b/colossalai/shardformer/modeling/chatglm2.py
@@ -4,7 +4,6 @@
 
 import torch
 import torch.utils.checkpoint
-from torch.nn import CrossEntropyLoss
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.utils import logging
 
@@ -13,10 +12,13 @@
 from colossalai.shardformer.layer import AttnMaskType, ColoAttention
 from colossalai.shardformer.layer._operation import (
     all_to_all_comm,
-    gather_forward_split_backward,
+    gather_sp_output,
+    is_share_sp_tp,
     split_forward_gather_backward,
 )
 
+from ..layer import dist_cross_entropy
+
 
 def get_flash_core_attention_forward():
     from .chatglm2_6b.modeling_chatglm import CoreAttention
@@ -138,6 +140,7 @@ def chatglm_model_forward(
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
         shard_config: ShardConfig = None,
+        force_sp_output_gather: Optional[bool] = True,
     ):
         logger = logging.get_logger(__name__)
         output_hidden_states = (
@@ -180,6 +183,15 @@ def chatglm_model_forward(
         if full_attention_mask is None:
             if (attention_mask is not None and not attention_mask.all()) or (past_key_values and seq_length != 1):
                 full_attention_mask = self.get_masks(input_ids, past_key_values, padding_mask=attention_mask)
+
+        # Support SP + PP
+        sp_size = shard_config.sequence_parallel_size
+        sp_mode = shard_config.sequence_parallelism_mode
+        sp_group = shard_config.sequence_parallel_process_group
+        # For generating full positions ids (the states will be gathered along the seq dim before attention fwd).
+        if sp_mode != "ring_attn" and not stage_manager.is_first_stage():
+            seq_length *= sp_size
+
         # Rotary positional embeddings
         rotary_pos_emb = self.rotary_pos_emb(self.seq_length)
         if position_ids is not None:
@@ -200,29 +212,23 @@ def chatglm_model_forward(
         all_hidden_states = () if output_hidden_states else None
         start_idx, end_idx = stage_index[0], stage_index[1]
 
-        if shard_config and shard_config.enable_sequence_parallelism:
-            if shard_config.sequence_parallelism_mode == "split_gather":
-                hidden_states = split_forward_gather_backward(
-                    hidden_states,
-                    dim=0,
-                    process_group=shard_config.tensor_parallel_process_group,
-                    fp8_communication=shard_config.fp8_communication,
-                )
-            elif shard_config.sequence_parallelism_mode == "all_to_all":
-                hidden_states = split_forward_gather_backward(
-                    hidden_states,
-                    dim=0,
-                    process_group=shard_config.sequence_parallel_process_group,
-                    grad_scale=1 / shard_config.sequence_parallel_size,
-                    fp8_communication=shard_config.fp8_communication,
-                )
-            elif shard_config.sequence_parallelism_mode == "all_to_all":
-                hidden_states = split_forward_gather_backward(
-                    hidden_states,
-                    dim=0,
-                    process_group=shard_config.sequence_parallel_process_group,
-                    grad_scale=1 / shard_config.sequence_parallel_size,
-                )
+        # Keep the input split across all PP stages
+        if stage_manager.is_first_stage():
+            if shard_config.enable_sequence_parallelism:
+                if sp_mode == "split_gather":
+                    hidden_states = split_forward_gather_backward(
+                        hidden_states,
+                        dim=0,
+                        process_group=sp_group,
+                    )
+                elif shard_config.sequence_parallelism_mode == "all_to_all":
+                    hidden_states = split_forward_gather_backward(
+                        hidden_states,
+                        dim=0,
+                        process_group=shard_config.sequence_parallel_process_group,
+                        grad_scale=1 / shard_config.sequence_parallel_size,
+                    )
+
         for idx in range(start_idx, end_idx):
             layer = self.encoder._get_layer(idx)
             if output_hidden_states:
@@ -248,35 +254,19 @@ def chatglm_model_forward(
             if use_cache:
                 presents = presents + (kv_cache,)
 
-        if shard_config and shard_config.enable_sequence_parallelism:
-            if shard_config.sequence_parallelism_mode == "split_gather":
-                hidden_states = gather_forward_split_backward(
-                    hidden_states,
-                    dim=0,
-                    process_group=shard_config.tensor_parallel_process_group,
-                    fp8_communication=shard_config.fp8_communication,
-                )
-            elif shard_config.sequence_parallelism_mode == "all_to_all":
-                hidden_states = gather_forward_split_backward(
-                    hidden_states,
-                    dim=0,
-                    process_group=shard_config.sequence_parallel_process_group,
-                    grad_scale=shard_config.sequence_parallel_size,
-                    fp8_communication=shard_config.fp8_communication,
-                )
-            elif shard_config.sequence_parallelism_mode == "all_to_all":
-                hidden_states = gather_forward_split_backward(
-                    hidden_states,
-                    dim=0,
-                    process_group=shard_config.sequence_parallel_process_group,
-                    grad_scale=shard_config.sequence_parallel_size,
-                )
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
         if stage_manager.is_last_stage():
             # final layer_norm
             if self.encoder.post_layer_norm:
                 hidden_states = self.encoder.final_layernorm(hidden_states)
+
+            # Gather seq-wise in the final output stage
+            if shard_config.enable_sequence_parallelism:
+                sp_mode = shard_config.sequence_parallelism_mode
+                if (not shard_config.parallel_output) or force_sp_output_gather or is_share_sp_tp(sp_mode):
+                    hidden_states = gather_sp_output(hidden_states, shard_config, sp_dim=0)
+
             if not return_dict:
                 return tuple(
                     v
@@ -333,6 +323,7 @@ def chatglm_for_conditional_generation_forward(
             hidden_states=hidden_states,
             stage_index=stage_index,
             shard_config=shard_config,
+            force_sp_output_gather=False,
         )
         if stage_manager.is_last_stage():
             hidden_states = transformer_outputs[0]
@@ -340,17 +331,21 @@ def chatglm_for_conditional_generation_forward(
                 hidden_states = hidden_states[-1:]
             lm_logits = self.transformer.output_layer(hidden_states)
             lm_logits = lm_logits.transpose(0, 1).contiguous()
+
             loss = None
             if labels is not None:
-                lm_logits = lm_logits.to(torch.float32)
-                # Shift so that tokens < n predict n
-                shift_logits = lm_logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-                # Flatten the tokens
-                loss_fct = CrossEntropyLoss(ignore_index=-100)
-                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
-                lm_logits = lm_logits.to(hidden_states.dtype)
-                loss = loss.to(hidden_states.dtype)
+                # ChatGLM doesn't have lm_head split
+                enable_tp = shard_config.enable_tensor_parallelism
+                shard_config.enable_tensor_parallelism = False
+                loss = dist_cross_entropy(
+                    labels,
+                    lm_logits,
+                    shard_config,
+                    self.transformer.output_layer.out_features,
+                    lm_logits.dtype,
+                )
+                shard_config.enable_tensor_parallelism = enable_tp
+
             if not return_dict:
                 output = (lm_logits,) + transformer_outputs[1:]
                 return ((loss,) + output) if loss is not None else output
@@ -379,6 +374,7 @@ def forward(
         use_cache: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        force_sp_output_gather: Optional[bool] = True,
     ):
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -456,22 +452,9 @@ def forward(
             use_cache=use_cache,
             output_hidden_states=output_hidden_states,
         )
-
-        if sp_mode in ["split_gather"]:
-            hidden_states = gather_forward_split_backward(
-                hidden_states,
-                dim=0,
-                process_group=shard_config.tensor_parallel_process_group,
-                fp8_communication=shard_config.fp8_communication,
-            )
-        elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(
-                hidden_states,
-                dim=0,
-                process_group=sp_group,
-                grad_scale=sp_size,
-                fp8_communication=shard_config.fp8_communication,
-            )
+        if shard_config.enable_sequence_parallelism:
+            if (not shard_config.parallel_output) or force_sp_output_gather or is_share_sp_tp(sp_mode):
+                hidden_states = gather_sp_output(hidden_states, shard_config, sp_dim=0)
 
         if not return_dict:
             return tuple(
diff --git a/colossalai/shardformer/modeling/command.py b/colossalai/shardformer/modeling/command.py
index 5303383945bc..ea811acdf21a 100644
--- a/colossalai/shardformer/modeling/command.py
+++ b/colossalai/shardformer/modeling/command.py
@@ -17,14 +17,13 @@
 from transformers.utils import logging
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer.layer._operation import (
-    all_to_all_comm,
-    gather_forward_split_backward,
-    split_forward_gather_backward,
-)
+from colossalai.shardformer.layer._operation import all_to_all_comm, split_forward_gather_backward
 from colossalai.shardformer.shard import ShardConfig
 
 from ..layer import ColoAttention, dist_cross_entropy
+from ..layer._operation import gather_sp_output, is_share_sp_tp
+
+_SUPPORTED_SP_MODE = ["all_to_all", "split_gather", "ring"]
 
 _SUPPORTED_SP_MODE = ["all_to_all", "split_gather", "ring", "ring_attn"]
 
@@ -52,6 +51,7 @@ def command_model_forward(
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
         shard_config: ShardConfig = None,
+        force_sp_output_gather: bool = True,
     ):
         logger = logging.get_logger(__name__)
 
@@ -93,10 +93,16 @@ def command_model_forward(
             if not isinstance(past_key_values, StaticCache):
                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
                 past_seen_tokens = past_key_values.get_seq_length()
+
+        # NOTE: For generating full positions ids
+        # (the states will be gathered along the seq dim before attention fwd).
+        if shard_config.sequence_parallelism_mode != "ring_attn" and not stage_manager.is_first_stage():
+            seq_length *= shard_config.sequence_parallel_size
+
         if cache_position is None:
             if isinstance(past_key_values, StaticCache):
                 raise ValueError("cache_position is a required argument when using StaticCache.")
-            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + hidden_states.shape[1], device=device)
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=device)
 
         seq_length_with_past = seq_length + past_seen_tokens
 
@@ -136,7 +142,7 @@ def command_model_forward(
                 )
                 use_cache = False
 
-        if shard_config and shard_config.enable_sequence_parallelism:
+        if stage_manager.is_first_stage() and shard_config.enable_sequence_parallelism:
             if shard_config.sequence_parallelism_mode in ["split_gather", "ring"]:
                 hidden_states = split_forward_gather_backward(
                     hidden_states,
@@ -208,23 +214,10 @@ def command_model_forward(
 
         if stage_manager.is_last_stage():
             hidden_states = self.norm(hidden_states)
-
-        if shard_config and shard_config.enable_sequence_parallelism:
-            if shard_config.sequence_parallelism_mode in ["split_gather", "ring"]:
-                hidden_states = gather_forward_split_backward(
-                    hidden_states,
-                    dim=1,
-                    process_group=shard_config.tensor_parallel_process_group,
-                    fp8_communication=shard_config.fp8_communication,
-                )
-            elif shard_config.sequence_parallelism_mode == "all_to_all":
-                hidden_states = gather_forward_split_backward(
-                    hidden_states,
-                    dim=1,
-                    process_group=shard_config.sequence_parallel_process_group,
-                    grad_scale=shard_config.sequence_parallel_size,
-                    fp8_communication=shard_config.fp8_communication,
-                )
+            sp_mode = shard_config.sequence_parallelism_mode
+            if shard_config.enable_sequence_parallelism:
+                if (not shard_config.parallel_output) or force_sp_output_gather or is_share_sp_tp(sp_mode):
+                    hidden_states = gather_sp_output(hidden_states, shard_config)
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
@@ -327,6 +320,7 @@ def command_for_causal_lm_forward(
             hidden_states=hidden_states,
             stage_index=stage_index,
             shard_config=shard_config,
+            force_sp_output_gather=False,
         )
         past_key_values = None
 
@@ -335,9 +329,10 @@ def command_for_causal_lm_forward(
             logits = self.lm_head(hidden_states)
             logits = logits * self.logit_scale
             logits = logits.float()
-            loss = dist_cross_entropy(
-                labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
-            )
+
+            loss = None
+            if labels is not None:
+                loss = dist_cross_entropy(labels, logits, shard_config, self.lm_head.out_features, self.model.dtype)
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
@@ -482,6 +477,7 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        force_sp_output_gather: bool = True,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -584,14 +580,10 @@ def forward(
 
         hidden_states = self.norm(hidden_states)
 
-        if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(
-                hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
-            )
-        elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(
-                hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
-            )
+        # Cases that don't support parallelizing cross entropy computation along sequence
+        if shard_config.enable_sequence_parallelism:
+            if (not shard_config.parallel_output) or is_share_sp_tp(sp_mode) or force_sp_output_gather:
+                hidden_states = gather_sp_output(hidden_states, shard_config)
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
@@ -676,6 +668,7 @@ def forward(
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
             cache_position=cache_position,
+            force_sp_output_gather=False,
         )
 
         hidden_states = outputs[0]
@@ -683,14 +676,16 @@ def forward(
         logits = self.lm_head(hidden_states)
         logits = logits * self.logit_scale
         logits = logits.float()
-        loss = dist_cross_entropy(
-            labels,
-            logits,
-            shard_config,
-            self.lm_head.out_features,
-            self.config.vocab_size,
-            self.model.dtype,
-        )
+
+        loss = None
+        if labels is not None:
+            loss = dist_cross_entropy(
+                labels,
+                logits,
+                shard_config,
+                self.lm_head.out_features,
+                self.model.dtype,
+            )
 
         if not return_dict:
             output = (logits,) + outputs[1:]
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 97544e1105d6..798fca88fb4f 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -21,8 +21,9 @@
 from transformers.utils import logging
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer.layer import ColoAttention
-from colossalai.shardformer.layer._operation import gather_forward_split_backward, split_forward_gather_backward
+from colossalai.shardformer.layer import ColoAttention, RingAttention
+from colossalai.shardformer.layer._operation import gather_sp_output, split_forward_gather_backward
+from colossalai.shardformer.layer.utils import is_share_sp_tp, split_batch_zigzag
 from colossalai.shardformer.shard import ShardConfig
 
 from ..layer import dist_cross_entropy
@@ -39,10 +40,16 @@ def _get_attention_mask(
     encoder_hidden_states: Optional[torch.Tensor],
     encoder_attention_mask: Optional[torch.FloatTensor],
 ) -> Tuple[Optional[Union[torch.Tensor, dict]], Optional[Union[torch.Tensor, dict]]]:
-    batch_size, seq_len = hidden_states.shape[:2]
+    # Received input is already split for non-first pipeline stages,
+    # but attn mask isn't
+    batch_size = hidden_states.size(0)
+    seq_len = attention_mask.size(-1)
+
+    sp_mode = shard_config.sequence_parallelism_mode
     # If a 2D or 3D attention mask is provided for the cross-attention
     # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
     if self.config.add_cross_attention and encoder_hidden_states is not None:
+        assert not sp_mode == "ring_attn", "Ring Attention only supports decoder-only."
         encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
         if shard_config.enable_flash_attention:
             encoder_attention_mask = ColoAttention.prepare_attn_kwargs(
@@ -62,6 +69,7 @@ def _get_attention_mask(
             encoder_attention_mask = {"attention_mask": None}
         else:
             encoder_attention_mask = None
+
     # GPT2Attention mask.
     past_key_values_length = 0
     if past_key_values is not None and past_key_values[0] is not None:
@@ -69,6 +77,7 @@ def _get_attention_mask(
     if shard_config.enable_flash_attention:
         if attention_mask is not None:
             attention_mask = attention_mask.view(batch_size, -1)
+
         attention_mask = ColoAttention.prepare_attn_kwargs(
             (batch_size, 1, seq_len, seq_len + past_key_values_length),
             hidden_states.dtype,
@@ -123,6 +132,7 @@ def gpt2_model_forward(
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
         shard_config: ShardConfig = None,
+        force_sp_gather: Optional[bool] = True,
     ) -> Union[Dict, Tuple, BaseModelOutputWithPastAndCrossAttentions]:
         # This function is modified on the basis of transformers.models.gpt2.modeling_gpt2.GPT2Model.forward.
         # Please refer to original code of transformers for more details.
@@ -146,16 +156,15 @@ def gpt2_model_forward(
             logger.warning_once("use_cache=True is not supported for pipeline models at the moment.")
             use_cache = False
 
-        if stage_manager.is_first_stage():
+        disable_pp = stage_manager is None
+        if disable_pp or stage_manager.is_first_stage():
             if input_ids is not None and inputs_embeds is not None:
                 raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
             elif input_ids is not None:
                 input_shape = input_ids.size()
                 input_ids = input_ids.view(-1, input_shape[-1])
-                input_ids.shape[0]
             elif inputs_embeds is not None:
                 input_shape = inputs_embeds.size()[:-1]
-                inputs_embeds.shape[0]
             else:
                 raise ValueError("You have to specify either input_ids or inputs_embeds")
 
@@ -176,7 +185,7 @@ def gpt2_model_forward(
         # head_mask has shape n_layer x batch x n_heads x N x N
         head_mask = self.get_head_mask(head_mask, self.config.n_layer)
 
-        if stage_manager.is_first_stage():
+        if disable_pp or stage_manager.is_first_stage():
             if position_ids is None:
                 position_ids = torch.arange(0, input_shape[-1], dtype=torch.long, device=device)
                 position_ids = position_ids.unsqueeze(0)
@@ -190,9 +199,7 @@ def gpt2_model_forward(
                 hidden_states = hidden_states + token_type_embeds
             hidden_states = self.drop(hidden_states)
 
-        output_shape = input_shape + (hidden_states.size(-1),)
-
-        attention_mask, encoder_attention_mask = _get_attention_mask(
+        attn_kwargs, encoder_attention_mask = _get_attention_mask(
             self,
             shard_config,
             hidden_states,
@@ -215,23 +222,43 @@ def gpt2_model_forward(
 
         # split the input tensor along sequence dimension
         # [batch_size, seq_len, hidden_size] -> [batch_size, seq_len/TP_size, hidden_size]
-        if shard_config and shard_config.enable_sequence_parallelism:
-            if shard_config.sequence_parallelism_mode == "split_gather":
-                hidden_states = split_forward_gather_backward(
-                    hidden_states,
-                    dim=1,
-                    process_group=shard_config.tensor_parallel_process_group,
-                    fp8_communication=shard_config.fp8_communication,
-                )
+        sp_mode = shard_config.sequence_parallelism_mode
+        sp_group = shard_config.sequence_parallel_process_group
+        if disable_pp or stage_manager.is_first_stage():
+            # Ring Attention's special zigzag batch processing
+            if sp_mode == "ring_attn":
+                assert shard_config.enable_flash_attention, "Ring Attention inherently requires Flash Attention."
+                if not attention_mask.bool().all():
+                    hidden_states, attn_kwargs, position_ids = RingAttention.prepare_varlen_batch(
+                        attention_mask, sp_group, hidden_states, position_ids
+                    )
+                else:
+                    hidden_states, position_ids = split_batch_zigzag([hidden_states, position_ids], sp_group)
+            # Other sp modes
+            else:
+                if sp_mode == "split_gather":
+                    hidden_states = split_forward_gather_backward(
+                        hidden_states,
+                        dim=1,
+                        process_group=shard_config.tensor_parallel_process_group,
+                    )
+        elif sp_mode == "ring_attn":
+            # Later stages already received split hidden states
+            _, attn_kwargs, _ = RingAttention.prepare_varlen_batch(attention_mask, sp_group)
+        del attention_mask
 
         # Going through held blocks.
-        start_idx, end_idx = stage_index[0], stage_index[1]
+        if disable_pp:
+            start_idx, end_idx = 0, len(self.h)
+        else:
+            start_idx, end_idx = stage_index[0], stage_index[1]
+
         for i in range(start_idx, end_idx):
             block = self.h[i]
             torch.cuda.set_device(hidden_states.device)
             # Ensure that attention_mask is always on the same device as hidden_states
-            if torch.is_tensor(attention_mask):
-                attention_mask = attention_mask.to(hidden_states.device)
+            if torch.is_tensor(attn_kwargs):
+                attn_kwargs = attn_kwargs.to(hidden_states.device)
             if isinstance(head_mask, torch.Tensor):
                 head_mask = head_mask.to(hidden_states.device)
             if output_hidden_states:
@@ -242,7 +269,7 @@ def gpt2_model_forward(
                     block.__call__,
                     hidden_states,
                     None,
-                    attention_mask,
+                    attn_kwargs,
                     head_mask[i],
                     encoder_hidden_states,
                     encoder_attention_mask,
@@ -253,7 +280,7 @@ def gpt2_model_forward(
                 outputs = block(
                     hidden_states,
                     layer_past=None,
-                    attention_mask=attention_mask,
+                    attention_mask=attn_kwargs,
                     head_mask=head_mask[i],
                     encoder_hidden_states=encoder_hidden_states,
                     encoder_attention_mask=encoder_attention_mask,
@@ -270,26 +297,25 @@ def gpt2_model_forward(
                 if self.config.add_cross_attention:
                     all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)
 
-        # When sequence parallelism done, gather the output tensor in forward and split it in backward
-        if shard_config and shard_config.enable_sequence_parallelism:
-            if shard_config.sequence_parallelism_mode == "split_gather":
-                hidden_states = gather_forward_split_backward(
-                    hidden_states,
-                    dim=1,
-                    process_group=shard_config.tensor_parallel_process_group,
-                    fp8_communication=shard_config.fp8_communication,
-                )
+        # When sequence parallelism is done, gather the output tensor in forward and split it in backward
+        gather_output = (not shard_config.parallel_output) or force_sp_gather or is_share_sp_tp(sp_mode)
+        if disable_pp or stage_manager.is_last_stage():
+            if gather_output:
+                hidden_states = gather_sp_output(hidden_states, shard_config)
 
-        if stage_manager.is_last_stage():
-            hidden_states = self.ln_f(hidden_states)
+        # gather_sp_output could've changed seq length.
+        input_shape = (*input_shape[:-1], hidden_states.size(-2))
+        output_shape = input_shape + (hidden_states.size(-1),)
 
+        if disable_pp or stage_manager.is_last_stage():
+            hidden_states = self.ln_f(hidden_states)
         hidden_states = hidden_states.view(output_shape)
 
         # Add last hidden state
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
 
-        if stage_manager.is_last_stage():
+        if disable_pp or stage_manager.is_last_stage():
             if not return_dict:
                 return tuple(
                     v
@@ -366,17 +392,29 @@ def gpt2_lmhead_model_forward(
             hidden_states=hidden_states,
             stage_index=stage_index,
             shard_config=shard_config,
+            force_sp_gather=False,
         )
 
         # If not at the last stage, return hidden_states as in GPT2Model
-        if not stage_manager.is_last_stage():
+        disable_pp = stage_manager is None
+        if (not disable_pp) and (not stage_manager.is_last_stage()):
             return {"hidden_states": outputs["hidden_states"]}
 
         hidden_states = outputs[0]
         lm_logits = self.lm_head(hidden_states)
-        loss = dist_cross_entropy(
-            labels, lm_logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.transformer.dtype
-        )
+        if shard_config.sequence_parallelism_mode == "ring_attn":
+            # Split labels in a zigzag fashion too
+            sp_group = shard_config.sequence_parallel_process_group
+            if not attention_mask.bool().all():
+                # [B, max_seqlen // sp_size]
+                labels, _, _ = RingAttention.prepare_varlen_batch(attention_mask, sp_group, labels, is_label=True)
+            else:
+                labels = split_batch_zigzag(labels, sp_group, seq_dim=1, is_label=True)
+
+        if labels is not None:
+            loss = dist_cross_entropy(
+                labels, lm_logits, shard_config, self.lm_head.out_features, self.transformer.dtype
+            )
 
         if not return_dict:
             output = (lm_logits,) + outputs[1:]
@@ -770,7 +808,7 @@ def gpt2_for_sequence_classification_forward(
         )
 
 
-def get_gpt2_flash_attention_forward():
+def get_gpt2_flash_attention_forward(shard_config: Optional[ShardConfig] = None):
     from transformers.models.gpt2.modeling_gpt2 import GPT2Attention
 
     def forward(
@@ -817,7 +855,22 @@ def forward(
         if self.scale_attn_by_inverse_layer_idx:
             scale /= float(self.layer_idx + 1)
         dropout_p = self.attn_dropout.p if self.training else 0.0
-        attn_output = ColoAttention.attention(query, key, value, **attention_mask, dropout_p=dropout_p, scale=scale)
+
+        sp_mode = shard_config.sequence_parallelism_mode
+        sp_group = shard_config.sequence_parallel_process_group
+        if sp_mode == "ring_attn":
+            attn_output = RingAttention.attention(
+                query,
+                key,
+                value,
+                sp_group,
+                **attention_mask,
+                dropout_p=dropout_p,
+                scale=scale,
+                inner_ring_size=shard_config.inner_ring_size,
+            )
+        else:
+            attn_output = ColoAttention.attention(query, key, value, **attention_mask, dropout_p=dropout_p, scale=scale)
         attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
         attn_output = self.c_proj(attn_output)
         attn_output = self.resid_dropout(attn_output)
@@ -828,466 +881,6 @@ def forward(
     return forward
 
 
-def get_gpt_model_forward_for_flash_attn(shard_config: ShardConfig):
-    def forward(
-        self: GPT2Model,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-            input_ids.shape[0]
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-            inputs_embeds.shape[0]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, input_shape[-1])
-        if position_ids is not None:
-            position_ids = position_ids.view(-1, input_shape[-1])
-
-        if past_key_values is None:
-            past_length = 0
-            past_key_values = tuple([None] * len(self.h))
-        else:
-            past_length = past_key_values[0][0].size(-2)
-        if position_ids is None:
-            position_ids = torch.arange(
-                past_length,
-                input_shape[-1] + past_length,
-                dtype=torch.long,
-                device=device,
-            )
-            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids)
-        position_embeds = self.wpe(position_ids)
-        hidden_states = inputs_embeds + position_embeds
-
-        if token_type_ids is not None:
-            token_type_embeds = self.wte(token_type_ids)
-            hidden_states = hidden_states + token_type_embeds
-
-        hidden_states = self.drop(hidden_states)
-
-        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
-
-        attention_mask, encoder_attention_mask = _get_attention_mask(
-            self,
-            shard_config,
-            hidden_states,
-            past_key_values,
-            attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
-        presents = () if use_cache else None
-        all_self_attentions = () if output_attentions else None
-        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
-        all_hidden_states = () if output_hidden_states else None
-        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-            # Model parallel
-            if self.model_parallel:
-                torch.cuda.set_device(hidden_states.device)
-                # Ensure layer_past is on same device as hidden_states (might not be correct)
-                if layer_past is not None:
-                    layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past)
-                # Ensure that attention_mask is always on the same device as hidden_states
-                if torch.is_tensor(attention_mask):
-                    attention_mask = attention_mask.to(hidden_states.device)
-                if isinstance(head_mask, torch.Tensor):
-                    head_mask = head_mask.to(hidden_states.device)
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            if self.gradient_checkpointing and self.training:
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        # None for past_key_value
-                        return module(*inputs, use_cache, output_attentions)
-
-                    return custom_forward
-
-                outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
-                    hidden_states,
-                    None,
-                    attention_mask,
-                    head_mask[i],
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                )
-            else:
-                outputs = block(
-                    hidden_states,
-                    layer_past=layer_past,
-                    attention_mask=attention_mask,
-                    head_mask=head_mask[i],
-                    encoder_hidden_states=encoder_hidden_states,
-                    encoder_attention_mask=encoder_attention_mask,
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                )
-
-            hidden_states = outputs[0]
-            if use_cache is True:
-                presents = presents + (outputs[1],)
-
-            if output_attentions:
-                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
-                if self.config.add_cross_attention:
-                    all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)
-
-            # Model Parallel: If it's the last layer for that device, put things on the next device
-            if self.model_parallel:
-                for k, v in self.device_map.items():
-                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
-                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
-
-        hidden_states = self.ln_f(hidden_states)
-
-        hidden_states = hidden_states.view(output_shape)
-        # Add last hidden state
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(
-                v
-                for v in [
-                    hidden_states,
-                    presents,
-                    all_hidden_states,
-                    all_self_attentions,
-                    all_cross_attentions,
-                ]
-                if v is not None
-            )
-
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=presents,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attentions,
-            cross_attentions=all_cross_attentions,
-        )
-
-    return forward
-
-
-def gpt2_sequence_parallel_forward_fn(shard_config: ShardConfig):
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input_shape = input_ids.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-            input_ids.shape[0]
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-            inputs_embeds.shape[0]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, input_shape[-1])
-        if position_ids is not None:
-            position_ids = position_ids.view(-1, input_shape[-1])
-
-        if past_key_values is None:
-            past_length = 0
-            past_key_values = tuple([None] * len(self.h))
-        else:
-            past_length = past_key_values[0][0].size(-2)
-        if position_ids is None:
-            position_ids = torch.arange(
-                past_length,
-                input_shape[-1] + past_length,
-                dtype=torch.long,
-                device=device,
-            )
-            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
-
-        if inputs_embeds is None:
-            inputs_embeds = self.wte(input_ids)
-        position_embeds = self.wpe(position_ids)
-        hidden_states = inputs_embeds + position_embeds
-
-        if token_type_ids is not None:
-            token_type_embeds = self.wte(token_type_ids)
-            hidden_states = hidden_states + token_type_embeds
-
-        hidden_states = self.drop(hidden_states)
-
-        output_shape = input_shape + (hidden_states.size(-1),)
-        attention_mask, encoder_attention_mask = _get_attention_mask(
-            self,
-            shard_config,
-            hidden_states,
-            past_key_values,
-            attention_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-        )
-
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger = logging.get_logger(__name__)
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
-        presents = () if use_cache else None
-        all_self_attentions = () if output_attentions else None
-        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
-        all_hidden_states = () if output_hidden_states else None
-
-        # split the input tensor along sequence dimension
-        # [batch_size, seq_len, hidden_size] -> [batch_size, seq_len/TP_size, hidden_size]
-        hidden_states = split_forward_gather_backward(
-            hidden_states,
-            dim=1,
-            process_group=shard_config.sequence_parallel_process_group,
-            fp8_communication=shard_config.fp8_communication,
-        )
-
-        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
-            # Model parallel
-            if self.model_parallel:
-                torch.cuda.set_device(hidden_states.device)
-                # Ensure layer_past is on same device as hidden_states (might not be correct)
-                if layer_past is not None:
-                    layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past)
-                # Ensure that attention_mask is always on the same device as hidden_states
-                if torch.is_tensor(attention_mask):
-                    attention_mask = attention_mask.to(hidden_states.device)
-                if isinstance(head_mask, torch.Tensor):
-                    head_mask = head_mask.to(hidden_states.device)
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            if self.gradient_checkpointing and self.training:
-
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        # None for past_key_value
-                        return module(*inputs, use_cache, output_attentions)
-
-                    return custom_forward
-
-                outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
-                    hidden_states,
-                    None,
-                    attention_mask,
-                    head_mask[i],
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                )
-            else:
-                outputs = block(
-                    hidden_states,
-                    layer_past=layer_past,
-                    attention_mask=attention_mask,
-                    head_mask=head_mask[i],
-                    encoder_hidden_states=encoder_hidden_states,
-                    encoder_attention_mask=encoder_attention_mask,
-                    use_cache=use_cache,
-                    output_attentions=output_attentions,
-                )
-
-            hidden_states = outputs[0]
-            if use_cache is True:
-                presents = presents + (outputs[1],)
-
-            if output_attentions:
-                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
-                if self.config.add_cross_attention:
-                    all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],)
-
-            # Model Parallel: If it's the last layer for that device, put things on the next device
-            if self.model_parallel:
-                for k, v in self.device_map.items():
-                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
-                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
-
-        # When sequence parallelism done, gather the output tensor in forward and split it in backward
-        hidden_states = gather_forward_split_backward(
-            hidden_states,
-            dim=1,
-            process_group=shard_config.sequence_parallel_process_group,
-            fp8_communication=shard_config.fp8_communication,
-        )
-
-        hidden_states = self.ln_f(hidden_states)
-        hidden_states = hidden_states.view(output_shape)
-        # Add last hidden state
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        if not return_dict:
-            return tuple(
-                v
-                for v in [
-                    hidden_states,
-                    presents,
-                    all_hidden_states,
-                    all_self_attentions,
-                    all_cross_attentions,
-                ]
-                if v is not None
-            )
-
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=presents,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attentions,
-            cross_attentions=all_cross_attentions,
-        )
-
-    return forward
-
-
-def get_lm_forward_with_dist_cross_entropy(shard_config: ShardConfig):
-    from transformers import GPT2LMHeadModel
-
-    def forward(
-        self: GPT2LMHeadModel,
-        input_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        token_type_ids: Optional[torch.LongTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.FloatTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
-            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
-            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        transformer_outputs = self.transformer(
-            input_ids,
-            past_key_values=past_key_values,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.lm_head(hidden_states)
-        loss = dist_cross_entropy(
-            labels, lm_logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.transformer.dtype
-        )
-
-        if not return_dict:
-            output = (lm_logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return CausalLMOutputWithCrossAttentions(
-            loss=loss,
-            logits=lm_logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-            cross_attentions=transformer_outputs.cross_attentions,
-        )
-
-    return forward
-
-
 def get_jit_fused_gpt2_mlp_forward():
     from transformers.models.gpt2.modeling_gpt2 import GPT2MLP
 
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 4e8f67407bc6..47c17e7494f2 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -25,7 +25,6 @@
 from transformers.utils import logging
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer.layer import AttnMaskType
 from colossalai.shardformer.layer._operation import all_to_all_comm, gather_sp_output, split_forward_gather_backward
 from colossalai.shardformer.layer.utils import is_share_sp_tp, split_batch_zigzag
 from colossalai.shardformer.shard import ShardConfig
@@ -58,10 +57,7 @@ def llama_model_forward(
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
         shard_config: ShardConfig = None,
-        # Split output only when computing cross entropy using llama_for_causal_lm_forward
-        # or get_lm_forward_with_dist_cross_entropy
-        # Default to True to avoid bug when calling classification forward from huggingface
-        force_sp_output_gather: bool = True,
+        force_sp_gather: bool = True,  # Set to false only when computing cross entropy
     ):
         logger = logging.get_logger(__name__)
 
@@ -78,8 +74,9 @@ def llama_model_forward(
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
+        disable_pp = stage_manager is None
         # retrieve input_ids and inputs_embeds
-        if stage_manager.is_first_stage():
+        if disable_pp or stage_manager.is_first_stage():
             if input_ids is not None and inputs_embeds is not None:
                 raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
             elif input_ids is not None:
@@ -88,10 +85,10 @@ def llama_model_forward(
                 batch_size, seq_length, _ = inputs_embeds.shape[:2]
             else:
                 raise ValueError("You have to specify either input_ids or inputs_embeds")
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
             if inputs_embeds is None:
                 inputs_embeds = self.embed_tokens(input_ids)
             hidden_states = inputs_embeds
+            device = hidden_states.device
         else:
             input_shape = hidden_states.shape[:-1]
             batch_size, seq_length = input_shape
@@ -101,8 +98,8 @@ def llama_model_forward(
         sp_mode = shard_config.sequence_parallelism_mode
         sp_group = shard_config.sequence_parallel_process_group
         sp_size = shard_config.sequence_parallel_size
-        if sp_mode == "all_to_all" and not stage_manager.is_first_stage():
-            # For generating full positions ids, as the states will be gather along the seq dim in the attention layer later.
+        # Generating full positions ids for modes that gather sequence before attn
+        if stage_manager and (sp_mode != "ring_attn" and not stage_manager.is_first_stage()):
             seq_length *= sp_size
 
         past_seen_tokens = 0
@@ -117,7 +114,6 @@ def llama_model_forward(
 
         seq_length_with_past = seq_length + past_seen_tokens
 
-        # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future.
         if output_attentions:
             logger.warning_once("output_attentions=True is not supported for pipeline models at the moment.")
             output_attentions = False
@@ -130,14 +126,13 @@ def llama_model_forward(
 
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
-        # embed positions, for the first stage, hidden_states is the input embeddings,
-        # for the other stages, hidden_states is the output of the previous stage
-        if not stage_manager.is_first_stage() and sp_mode == "ring_attn":
+
+        no_split_input = disable_pp or not stage_manager.is_first_stage()
+        if no_split_input and sp_mode == "ring_attn":
             _, attn_kwargs, _ = RingAttention.prepare_varlen_batch(attention_mask, sp_group)
         elif shard_config.enable_flash_attention:
-            # in this case, attention_mask is a dict rather than a tensor
             mask_shape = (batch_size, 1, seq_length_with_past, seq_length_with_past)
-            attn_kwargs = ColoAttention.prepare_attn_kwargs(
+            attn_kwargs: dict = ColoAttention.prepare_attn_kwargs(
                 mask_shape,
                 hidden_states.dtype,
                 hidden_states.device,
@@ -146,15 +141,15 @@ def llama_model_forward(
                 invert=(sp_mode != "ring_attn"),
             )
         else:
-            attn_kwargs = self._update_causal_mask(attention_mask, hidden_states, cache_position)
+            attn_kwargs: torch.Tensor = self._update_causal_mask(attention_mask, hidden_states, cache_position)
 
-        # Support SP + PP
-        # TODO: support padded casual cu_seqlens across stages
-        if stage_manager.is_first_stage():
+        # Support SP + PP. Later stages have already received the split input.
+        split_input = disable_pp or stage_manager.is_first_stage()
+        if split_input:
             # Ring Attention zigzag batch processing
             if sp_mode == "ring_attn":
                 assert shard_config.enable_flash_attention, "Ring Attention inherently requires Flash Attention."
-                if attn_kwargs["attention_mask_type"] == AttnMaskType.PADDED_CAUSAL:
+                if not attention_mask.bool().all():
                     hidden_states, attn_kwargs, position_ids = RingAttention.prepare_varlen_batch(
                         attention_mask, sp_group, hidden_states, position_ids
                     )
@@ -181,8 +176,8 @@ def llama_model_forward(
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
         next_decoder_cache = None
+        start_idx, end_idx = (0, len(self.layers)) if disable_pp else (stage_index[0], stage_index[1])
 
-        start_idx, end_idx = stage_index[0], stage_index[1]
         num_ckpt_layers = 0
         if self.gradient_checkpointing and self.training:
             num_ckpt_layers = end_idx - start_idx
@@ -228,18 +223,16 @@ def llama_model_forward(
             if output_attentions:
                 all_self_attns += (layer_outputs[1],)
 
-        if stage_manager.is_last_stage():
+        if disable_pp or stage_manager.is_last_stage():
             hidden_states = self.norm(hidden_states)
-            if (not shard_config.parallel_output) or force_sp_output_gather or is_share_sp_tp(sp_mode):
-                hidden_states = gather_sp_output(
-                    hidden_states, sp_group, sp_mode, fp8_communication=shard_config.fp8_communication
-                )
+            if (not shard_config.parallel_output) or force_sp_gather or is_share_sp_tp(sp_mode):  # noqa
+                hidden_states = gather_sp_output(hidden_states, shard_config)
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
         next_cache = next_decoder_cache if use_cache else None
-        if stage_manager.is_last_stage():
+        if disable_pp or stage_manager.is_last_stage():
             if not return_dict:
                 return tuple(
                     v
@@ -257,7 +250,7 @@ def llama_model_forward(
                 hidden_states=all_hidden_states,
                 attentions=all_self_attns,
             )
-        # always return dict for imediate stage
+        # always return dict for intermediate stage
         return {"hidden_states": hidden_states}
 
     @staticmethod
@@ -323,7 +316,7 @@ def llama_for_causal_lm_forward(
             # Split labels in a zigzag fashion too
             sp_group = shard_config.sequence_parallel_process_group
             if attention_mask.bool().all():
-                labels = split_batch_zigzag(labels, sp_group, seq_dim=1)
+                labels = split_batch_zigzag(labels, sp_group, seq_dim=1, is_label=True)
             else:
                 # [B, max_seqlen // sp_size]
                 labels, _, _ = RingAttention.prepare_varlen_batch(attention_mask, sp_group, labels, is_label=True)
@@ -345,16 +338,17 @@ def llama_for_causal_lm_forward(
             hidden_states=hidden_states,
             stage_index=stage_index,
             shard_config=shard_config,
-            force_sp_output_gather=False,
+            force_sp_gather=False,
         )
         past_key_values = None
 
-        if stage_manager.is_last_stage():
+        disable_pp = stage_manager is None
+        if disable_pp or stage_manager.is_last_stage():
             hidden_states = outputs[0]
             logits = self.lm_head(hidden_states)
-            loss = dist_cross_entropy(
-                labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
-            )
+            loss = None
+            if labels is not None:
+                loss = dist_cross_entropy(labels, logits, shard_config, self.lm_head.out_features, self.model.dtype)
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
@@ -629,263 +623,3 @@ def forward(
         return attn_output, attn_weights, past_key_value
 
     return forward
-
-
-def get_llama_flash_attention_model_forward(shard_config: ShardConfig, sp_mode=None, sp_size=None, sp_group=None):
-    logger = logging.get_logger(__name__)
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        # Split output only when computing cross entropy using llama_for_causal_lm_forward
-        # or get_lm_forward_with_dist_cross_entropy
-        # Default to True to avoid bug when calling classification forward from huggingface
-        force_sp_output_gather: bool = True,
-    ) -> Union[Tuple, BaseModelOutputWithPast]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        # retrieve input_ids and inputs_embeds
-        if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError(
-                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
-            )
-
-        if (self.gradient_checkpointing or sp_mode in ["ring", "all_to_all"]) and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                )
-                use_cache = False
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        past_seen_tokens = 0
-        seq_len = inputs_embeds.shape[1]
-        batch_size = inputs_embeds.shape[0]
-        if use_cache:  # kept for BC (cache positions)
-            if not isinstance(past_key_values, StaticCache):
-                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-                past_seen_tokens = past_key_values.get_seq_length()
-
-        if cache_position is None:
-            if isinstance(past_key_values, StaticCache):
-                raise ValueError("cache_position is a required argument when using StaticCache.")
-            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_len, device=inputs_embeds.device)
-        if position_ids is None:
-            position_ids = cache_position.unsqueeze(0)
-
-        if shard_config.enable_flash_attention:
-            mask_shape = (batch_size, 1, seq_len, past_seen_tokens + seq_len)
-            attn_kwargs: dict = ColoAttention.prepare_attn_kwargs(
-                mask_shape,
-                inputs_embeds.dtype,
-                inputs_embeds.device,
-                q_padding_mask=attention_mask,
-                is_causal=True,
-                invert=(sp_mode != "ring_attn"),
-            )
-
-        else:
-            attn_kwargs: torch.Tensor = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
-
-        # Ring Attention zigzag batch processing
-        if sp_mode == "ring_attn":
-            assert shard_config.enable_flash_attention, "Ring Attention inherently requires Flash Attention."
-            if attn_kwargs["attention_mask_type"] == AttnMaskType.PADDED_CAUSAL:
-                inputs_embeds, attn_kwargs, position_ids = RingAttention.prepare_varlen_batch(
-                    attention_mask, sp_group, inputs_embeds, position_ids
-                )
-            else:
-                inputs_embeds, position_ids = split_batch_zigzag([inputs_embeds, position_ids], sp_group)
-                attn_kwargs = {"attention_mask_type": attn_kwargs["attention_mask_type"]}  # drop redundant tensors
-
-        elif is_share_sp_tp(sp_mode):
-            inputs_embeds = split_forward_gather_backward(
-                inputs_embeds, 1, sp_group, fp8_communication=shard_config.fp8_communication
-            )
-        elif sp_mode == "all_to_all":
-            inputs_embeds = split_forward_gather_backward(
-                inputs_embeds, 1, sp_group, 1 / sp_size, fp8_communication=shard_config.fp8_communication
-            )
-        hidden_states = inputs_embeds
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        next_decoder_cache = None
-
-        for decoder_layer in self.layers:
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    decoder_layer.__call__,
-                    hidden_states,
-                    attn_kwargs,
-                    position_ids,
-                    past_key_values,
-                    output_attentions,
-                    use_cache,
-                    cache_position,
-                )
-
-            else:
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    attention_mask=attn_kwargs,
-                    position_ids=position_ids,
-                    past_key_value=past_key_values,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                    cache_position=cache_position,
-                )
-
-            hidden_states = layer_outputs[0]
-
-            if use_cache:
-                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        hidden_states = self.norm(hidden_states)
-        # Cases that don't support parallelizing cross entropy computation along sequence
-        if (not shard_config.parallel_output) or is_share_sp_tp(sp_mode) or force_sp_output_gather:
-            hidden_states = gather_sp_output(
-                hidden_states, sp_group, sp_mode, fp8_communication=shard_config.fp8_communication
-            )
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        next_cache = None
-        if use_cache:
-            next_cache = (
-                next_decoder_cache.to_legacy_cache() if isinstance(next_decoder_cache, Cache) else next_decoder_cache
-            )
-        if not return_dict:
-            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
-
-    return forward
-
-
-def get_lm_forward_with_dist_cross_entropy(shard_config: ShardConfig):
-    from transformers import LlamaForCausalLM
-
-    def forward(
-        self: LlamaForCausalLM,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, CausalLMOutputWithPast]:
-        r"""
-        Args:
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import AutoTokenizer, LlamaForCausalLM
-
-        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
-        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
-        >>> prompt = "Hey, are you conscious? Can you talk to me?"
-        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-        >>> # Generate
-        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
-        ```"""
-
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        if shard_config.sequence_parallelism_mode == "ring_attn" and shard_config.parallel_output:
-            # Special processing: Split labels in a zigzag fashion too
-            sp_group = shard_config.sequence_parallel_process_group
-            if attention_mask.bool().all():
-                labels = split_batch_zigzag(labels, sp_group, seq_dim=1, is_label=True)
-            else:
-                # [B, max_seq_len // sp_size]
-                labels, _, _ = RingAttention.prepare_varlen_batch(attention_mask, sp_group, labels, is_label=True)
-
-        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-        outputs = self.model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            cache_position=cache_position,
-            force_sp_output_gather=False,
-        )
-
-        hidden_states = outputs[0]
-        if self.config.pretraining_tp > 1:
-            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
-            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
-            logits = torch.cat(logits, dim=-1)
-        else:
-            logits = self.lm_head(hidden_states)
-        logits = logits.float()
-        loss = dist_cross_entropy(
-            labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
-        )
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
-
-        return CausalLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-
-    return forward
diff --git a/colossalai/shardformer/modeling/mistral.py b/colossalai/shardformer/modeling/mistral.py
index ec1a8a00a58a..7fc6a1062037 100644
--- a/colossalai/shardformer/modeling/mistral.py
+++ b/colossalai/shardformer/modeling/mistral.py
@@ -274,10 +274,9 @@ def mistral_for_causal_lm_forward(
             hidden_states = outputs[0]
             logits = self.lm_head(hidden_states)
             logits = logits.float()
-
-            loss = dist_cross_entropy(
-                labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
-            )
+            loss = None
+            if labels is not None:
+                loss = dist_cross_entropy(labels, logits, shard_config, self.lm_head.out_features, self.model.dtype)
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
@@ -687,10 +686,9 @@ def forward(
         hidden_states = outputs[0]
         logits = self.lm_head(hidden_states)
         logits = logits.float()
-
-        loss = dist_cross_entropy(
-            labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.dtype
-        )
+        loss = None
+        if labels is not None:
+            loss = dist_cross_entropy(labels, logits, shard_config, self.lm_head.out_features, self.model.dtype)
 
         if not return_dict:
             output = (logits,) + outputs[1:]
diff --git a/colossalai/shardformer/modeling/opt.py b/colossalai/shardformer/modeling/opt.py
index 636b46cc461d..3ea4db9e2f70 100644
--- a/colossalai/shardformer/modeling/opt.py
+++ b/colossalai/shardformer/modeling/opt.py
@@ -330,14 +330,15 @@ def opt_for_causal_lm_forward(
         )
         if stage_manager.is_last_stage():
             logits = self.lm_head(outputs[0]).contiguous()
-            loss = dist_cross_entropy(
-                labels,
-                logits,
-                shard_config,
-                self.lm_head.out_features,
-                self.config.vocab_size,
-                self.model.decoder.dtype,
-            )
+            loss = None
+            if labels is not None:
+                loss = dist_cross_entropy(
+                    labels,
+                    logits,
+                    shard_config,
+                    self.lm_head.out_features,
+                    self.model.decoder.dtype,
+                )
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
@@ -955,9 +956,9 @@ def forward(
         )
 
         logits = self.lm_head(outputs[0]).contiguous()
-        loss = dist_cross_entropy(
-            labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.model.decoder.dtype
-        )
+        loss = None
+        if labels is not None:
+            loss = dist_cross_entropy(labels, logits, shard_config, self.lm_head.out_features, self.model.decoder.dtype)
 
         if not return_dict:
             output = (logits,) + outputs[1:]
diff --git a/colossalai/shardformer/modeling/qwen2.py b/colossalai/shardformer/modeling/qwen2.py
index a61360d17570..569fc4a459c5 100644
--- a/colossalai/shardformer/modeling/qwen2.py
+++ b/colossalai/shardformer/modeling/qwen2.py
@@ -32,14 +32,12 @@
 from transformers.utils import logging
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
-from colossalai.shardformer.layer._operation import (
-    all_to_all_comm,
-    gather_forward_split_backward,
-    split_forward_gather_backward,
-)
+from colossalai.shardformer.layer._operation import all_to_all_comm, split_forward_gather_backward
 from colossalai.shardformer.shard import ShardConfig
 
 from ..layer import ColoAttention, dist_cross_entropy
+from ..layer._operation import gather_sp_output
+from ..layer.utils import is_share_sp_tp
 
 
 class Qwen2PipelineForwards:
@@ -64,6 +62,7 @@ def qwen2_model_forward(
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
         shard_config: ShardConfig = None,
+        force_sp_output_gather: bool = True,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         logger = logging.get_logger(__name__)
 
@@ -115,6 +114,14 @@ def qwen2_model_forward(
             past_key_values_length = past_key_values[0][0].shape[2]
             seq_length_with_past = seq_length_with_past + past_key_values_length
 
+        # Support SP + PP
+        sp_size = shard_config.sequence_parallel_size
+        sp_group = shard_config.sequence_parallel_process_group
+        sp_mode = shard_config.sequence_parallelism_mode
+        # For generating full positions ids (the states will be gathered along the seq dim before attention fwd).
+        if sp_mode != "ring_attn" and not stage_manager.is_first_stage():
+            seq_length *= sp_size
+
         if position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device
             position_ids = torch.arange(
@@ -151,7 +158,6 @@ def qwen2_model_forward(
             elif self._attn_implementation == "sdpa" and not output_attentions:
                 # output_attentions=True can not be supported when using SDPA, and we fall back on
                 # the manual implementation that requires a 4D causal mask in all cases.
-
                 attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                     attention_mask,
                     (batch_size, seq_length),
@@ -160,7 +166,6 @@ def qwen2_model_forward(
                 )
             else:
                 # 4d mask is passed through the layers
-
                 attention_mask = _prepare_4d_causal_attention_mask(
                     attention_mask,
                     (batch_size, seq_length),
@@ -169,22 +174,21 @@ def qwen2_model_forward(
                     sliding_window=self.config.sliding_window,
                 )
 
-        if shard_config and shard_config.enable_sequence_parallelism:
-            if shard_config.sequence_parallelism_mode in ["split_gather", "ring"]:
-                hidden_states = split_forward_gather_backward(
-                    hidden_states,
-                    dim=1,
-                    process_group=shard_config.tensor_parallel_process_group,
-                    fp8_communication=shard_config.fp8_communication,
-                )
-            elif shard_config.sequence_parallelism_mode == "all_to_all":
-                hidden_states = split_forward_gather_backward(
-                    hidden_states,
-                    dim=1,
-                    process_group=shard_config.sequence_parallel_process_group,
-                    grad_scale=1 / shard_config.sequence_parallel_size,
-                    fp8_communication=shard_config.fp8_communication,
-                )
+        if stage_manager.is_first_stage():
+            if shard_config.enable_sequence_parallelism:
+                if is_share_sp_tp(sp_mode):
+                    hidden_states = split_forward_gather_backward(
+                        hidden_states,
+                        dim=1,
+                        process_group=sp_group,
+                    )
+                elif sp_mode == "all_to_all":
+                    hidden_states = split_forward_gather_backward(
+                        hidden_states,
+                        dim=1,
+                        process_group=sp_group,
+                        grad_scale=1 / sp_size,
+                    )
 
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
@@ -241,23 +245,10 @@ def qwen2_model_forward(
 
         if stage_manager.is_last_stage():
             hidden_states = self.norm(hidden_states)
+            if shard_config.enable_sequence_parallelism:
+                if (not shard_config.parallel_output) or force_sp_output_gather or is_share_sp_tp(sp_mode):
+                    hidden_states = gather_sp_output(hidden_states, shard_config)
 
-        if shard_config and shard_config.enable_sequence_parallelism:
-            if shard_config.sequence_parallelism_mode in ["split_gather", "ring"]:
-                hidden_states = gather_forward_split_backward(
-                    hidden_states,
-                    dim=1,
-                    process_group=shard_config.tensor_parallel_process_group,
-                    fp8_communication=shard_config.fp8_communication,
-                )
-            elif shard_config.sequence_parallelism_mode == "all_to_all":
-                hidden_states = gather_forward_split_backward(
-                    hidden_states,
-                    dim=1,
-                    process_group=shard_config.sequence_parallel_process_group,
-                    grad_scale=shard_config.sequence_parallel_size,
-                    fp8_communication=shard_config.fp8_communication,
-                )
         # add hidden states from the last decoder layer
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
@@ -351,15 +342,18 @@ def qwen2_for_causal_lm_forward(
             hidden_states=hidden_states,
             stage_index=stage_index,
             shard_config=shard_config,
+            force_sp_output_gather=False,
         )
         past_key_values = None
 
         if stage_manager.is_last_stage():
             hidden_states = outputs[0]
+            if hidden_states.shape[1] == 2:
+                pass
             logits = self.lm_head(hidden_states)
-            loss = dist_cross_entropy(
-                labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, logits.dtype
-            )
+            loss = None
+            if labels is not None:
+                loss = dist_cross_entropy(labels, logits, shard_config, self.lm_head.out_features, logits.dtype)
 
             if not return_dict:
                 output = (logits,) + outputs[1:]
@@ -541,7 +535,6 @@ def forward(
         # Because the input can be padded, the absolute sequence length depends on the max position id.
         rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
         cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
-
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
 
         if past_key_value is not None:
@@ -635,6 +628,7 @@ def forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
+        force_sp_output_gather: bool = True,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -750,14 +744,9 @@ def forward(
 
         hidden_states = self.norm(hidden_states)
 
-        if sp_mode == "ring" or sp_mode == "split_gather":
-            hidden_states = gather_forward_split_backward(
-                hidden_states, 1, sp_group, fp8_communication=shard_config.fp8_communication
-            )
-        elif sp_mode == "all_to_all":
-            hidden_states = gather_forward_split_backward(
-                hidden_states, 1, sp_group, grad_scale=sp_size, fp8_communication=shard_config.fp8_communication
-            )
+        if shard_config.enable_sequence_parallelism:
+            if (not shard_config.parallel_output) or force_sp_output_gather or is_share_sp_tp(sp_mode):
+                hidden_states = gather_sp_output(hidden_states, shard_config)
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
@@ -834,14 +823,15 @@ def forward(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
+            force_sp_output_gather=False,
         )
 
         hidden_states = outputs[0]
         logits = self.lm_head(hidden_states)
         logits = logits.float()
-        loss = dist_cross_entropy(
-            labels, logits, shard_config, self.lm_head.out_features, self.config.vocab_size, logits.dtype
-        )
+        loss = None
+        if labels is not None:
+            loss = dist_cross_entropy(labels, logits, shard_config, self.lm_head.out_features, logits.dtype)
 
         if not return_dict:
             output = (logits,) + outputs[1:]
diff --git a/colossalai/shardformer/policies/chatglm2.py b/colossalai/shardformer/policies/chatglm2.py
index 16c13085ade1..1b7d2db85991 100644
--- a/colossalai/shardformer/policies/chatglm2.py
+++ b/colossalai/shardformer/policies/chatglm2.py
@@ -64,7 +64,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
 
         if sp_mode == "ring":
             warnings.warn(
-                f"For ChatGLM2, sequence parallelism is currently not support mode {sp_mode}, will set to be split_gather"
+                f"For ChatGLM2, sequence parallelism doesn't support mode {sp_mode} yet, will set to be split_gather"
             )
             sp_mode = "split_gather"
         overlap = self.shard_config.enable_sequence_overlap
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index 0a1949d85c74..d9233be9a822 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -6,14 +6,7 @@
 
 import colossalai.shardformer.layer as col_nn
 
-from ..modeling.gpt2 import (
-    GPT2PipelineForwards,
-    get_gpt2_flash_attention_forward,
-    get_gpt_model_forward_for_flash_attn,
-    get_jit_fused_gpt2_mlp_forward,
-    get_lm_forward_with_dist_cross_entropy,
-    gpt2_sequence_parallel_forward_fn,
-)
+from ..modeling.gpt2 import GPT2PipelineForwards, get_gpt2_flash_attention_forward, get_jit_fused_gpt2_mlp_forward
 from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 
 __all__ = [
@@ -71,18 +64,10 @@ def module_policy(self):
             warnings.warn(
                 f"For GPT2, sequence parallelism is currently not support mode {sp_mode}, will set to be split_gather"
             )
-            sp_mode = "split_gather"
+            self.shard_config.sequence_parallelism_mode = sp_mode = "split_gather"
         overlap = self.shard_config.enable_sequence_overlap
         sp_partial_derived = sp_mode in ["split_gather", "ring"]
         use_flash_attention = self.shard_config.enable_flash_attention
-        # todo: currently sp cannot be used with flashattention
-        if sp_mode in ["split_gather", "ring", "all_to_all"]:
-            if use_flash_attention:
-                warnings.warn(
-                    f"Sequence parallelism mode {sp_mode} cannot be used with FlashAttention, will disable FlashAttention automatically."
-                )
-                self.shard_config.enable_flash_attention = False
-                use_flash_attention = False
         if self.shard_config.enable_tensor_parallelism:
             assert (
                 self.model.config.num_attention_heads % self.shard_config.tensor_parallel_size == 0
@@ -211,18 +196,16 @@ def module_policy(self):
         if use_flash_attention:
             self.append_or_create_method_replacement(
                 description={
-                    "forward": get_gpt2_flash_attention_forward(),
+                    "forward": get_gpt2_flash_attention_forward(shard_config=self.shard_config),
                 },
                 policy=policy,
                 target_key=attn_cls,
             )
-            if not self.shard_config.pipeline_stage_manager:
-                policy[GPT2Model].method_replacement = {
-                    "forward": get_gpt_model_forward_for_flash_attn(self.shard_config)
-                }
 
-        if sp_mode is not None:
-            policy[GPT2Model].method_replacement = {"forward": gpt2_sequence_parallel_forward_fn(self.shard_config)}
+        if not self.shard_config.pipeline_stage_manager and self.shard_config.enable_sequence_parallelism:
+            policy[GPT2Model].method_replacement = {
+                "forward": partial(GPT2PipelineForwards.gpt2_model_forward, shard_config=self.shard_config)
+            }
 
         return policy
 
@@ -328,40 +311,39 @@ def module_policy(self):
         from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
 
         module_policy = super().module_policy()
-
+        module_policy[GPT2LMHeadModel] = ModulePolicyDescription()
         if self.shard_config.enable_tensor_parallelism:
-            addon_module = {
-                GPT2LMHeadModel: ModulePolicyDescription(
-                    sub_module_replacement=[
-                        SubModuleReplacementDescription(
-                            suffix="lm_head",
-                            target_module=col_nn.VocabParallelLMHead1D,
-                            kwargs={
-                                "gather_output": False,
-                                "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
-                                "fp8_communication": self.shard_config.fp8_communication,
-                            },
-                        )
-                    ],
-                )
-            }
-            if self.shard_config.parallel_output:
-                addon_module[GPT2LMHeadModel].method_replacement = {
-                    "forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)
-                }
+            self.append_or_create_submodule_replacement(
+                description=SubModuleReplacementDescription(
+                    suffix="lm_head",
+                    target_module=col_nn.VocabParallelLMHead1D,
+                    kwargs={
+                        "gather_output": False,
+                        "make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by,
+                    },
+                ),
+                policy=module_policy,
+                target_key=GPT2LMHeadModel,
+            )
         else:
-            addon_module = {
-                GPT2LMHeadModel: ModulePolicyDescription(
-                    sub_module_replacement=[
-                        SubModuleReplacementDescription(
-                            suffix="lm_head",
-                            target_module=col_nn.PaddingLMHead,
-                            kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
-                        )
-                    ]
-                )
-            }
-        module_policy.update(addon_module)
+            self.append_or_create_submodule_replacement(
+                description=SubModuleReplacementDescription(
+                    suffix="lm_head",
+                    target_module=col_nn.PaddingLMHead,
+                    kwargs={"make_vocab_size_divisible_by": self.shard_config.make_vocab_size_divisible_by},
+                ),
+                policy=module_policy,
+                target_key=GPT2LMHeadModel,
+            )
+
+        if self.shard_config.parallel_output:
+            self.append_or_create_method_replacement(
+                description={
+                    "forward": partial(GPT2PipelineForwards.gpt2_lmhead_model_forward, shard_config=self.shard_config)
+                },
+                policy=module_policy,
+                target_key=GPT2LMHeadModel,
+            )
 
         if self.pipeline_stage_manager is not None:
             self.set_pipeline_forward(
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 78ac5eaaeb79..ec517da4966f 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -16,12 +16,7 @@
     VocabParallelLMHead1D,
 )
 
-from ..modeling.llama import (
-    LlamaPipelineForwards,
-    get_llama_flash_attention_forward,
-    get_llama_flash_attention_model_forward,
-    get_lm_forward_with_dist_cross_entropy,
-)
+from ..modeling.llama import LlamaPipelineForwards, get_llama_flash_attention_forward
 from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 
 __all__ = ["LlamaPolicy", "LlamaForCausalLMPolicy", "LlamaForSequenceClassificationPolicy"]
@@ -99,11 +94,9 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         if self.pipeline_stage_manager is None:
             self.append_or_create_method_replacement(
                 description={
-                    "forward": get_llama_flash_attention_model_forward(
-                        self.shard_config,
-                        sp_mode=sp_mode,
-                        sp_size=sp_size,
-                        sp_group=sp_group,
+                    "forward": partial(
+                        LlamaPipelineForwards.llama_model_forward,
+                        shard_config=self.shard_config,
                     ),
                 },
                 policy=policy,
@@ -351,7 +344,8 @@ def module_policy(self):
         elif self.shard_config.enable_tensor_parallelism or self.shard_config.enable_sequence_parallelism:
             # Compute loss distributedly along the sequence dimension
             new_item[LlamaForCausalLM].method_replacement = {
-                "forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)
+                # "forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)
+                "forward": partial(LlamaPipelineForwards.llama_for_causal_lm_forward, shard_config=self.shard_config)
             }
         return policy
 
diff --git a/examples/language/gpt/hybridparallelism/benchmark.py b/examples/language/gpt/hybridparallelism/benchmark.py
index 8c236b524c26..91b9e6c04950 100644
--- a/examples/language/gpt/hybridparallelism/benchmark.py
+++ b/examples/language/gpt/hybridparallelism/benchmark.py
@@ -28,7 +28,7 @@
     "118M": GPT2Config(activation_function="gelu"),
     "338M": GPT2Config(n_embd=1024, n_head=16, n_layer=24, activation_function="gelu"),
     "738M": GPT2Config(n_embd=1280, n_head=20, n_layer=36, activation_function="gelu"),
-    "6.21B": GPT2Config(n_embd=4096, n_head=32, n_layer=32, n_positions=4096, activation_function="gelu"),
+    "6.21B": GPT2Config(n_embd=4096, n_head=32, n_layer=32, n_positions=32768, activation_function="gelu"),
 }
 
 
@@ -60,6 +60,8 @@ def main():
     parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
     parser.add_argument("--extra_dp", type=int, default=1, help="Extra data parallel size, used for Gemini")
     parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel size")
+    parser.add_argument("--sp", type=int, default=1, help="Sequence parallel size")
+    parser.add_argument("--sp_mode", type=str, default="ring_attn", help="Sequence parallel mode")
     parser.add_argument("--mbs", type=int, default=1)
     parser.add_argument("--zero", type=int, default=0)
     parser.add_argument("--pp_style", type=str, default="1f1b")
@@ -129,6 +131,9 @@ def empty_init():
             tp_size=args.tp,
             pp_size=args.pp,
             pp_style=args.pp_style,
+            sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
+            enable_sequence_parallelism=True,
             zero_stage=args.zero,
             num_model_chunks=args.num_model_chunks,
             enable_all_optimization=True,
@@ -214,6 +219,8 @@ def empty_init():
             performance_evaluator.on_step_start(step)
             outputs = model(**batch)
             loss = outputs[0]
+            del outputs
+
             booster.backward(loss, optimizer)
             optimizer.step()
             optimizer.zero_grad()
diff --git a/examples/language/performance_evaluator.py b/examples/language/performance_evaluator.py
index f5ad1d23d2a7..65c7e49a2f03 100644
--- a/examples/language/performance_evaluator.py
+++ b/examples/language/performance_evaluator.py
@@ -6,7 +6,6 @@
 from torch import Tensor
 from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler
 
-from colossalai.accelerator import get_accelerator
 from colossalai.cluster import DistCoordinator
 
 
@@ -22,8 +21,11 @@ def divide(x: float, y: float) -> float:
 def all_reduce_mean(x: float, world_size: int) -> float:
     if world_size == 1:
         return x
-    tensor = torch.tensor([x], device=get_accelerator().get_current_device())
-    dist.all_reduce(tensor)
+
+    # Use CPU tensor to avoid OOM/weird NCCl error
+    gloo_group = dist.new_group(backend="gloo")
+    tensor = torch.tensor([x], device="cpu")
+    dist.all_reduce(tensor, group=gloo_group)
     tensor = tensor / world_size
     return tensor.item()
 
diff --git a/tests/kit/model_zoo/transformers/gpt.py b/tests/kit/model_zoo/transformers/gpt.py
index f71776b6b4e0..f2b139beca83 100644
--- a/tests/kit/model_zoo/transformers/gpt.py
+++ b/tests/kit/model_zoo/transformers/gpt.py
@@ -27,7 +27,16 @@ def data_gen_for_lm():
     # LM data gen
     # the `labels` of LM is the token of the output, cause no padding, use `input_ids` as `labels`
     data = data_gen()
-    data["labels"] = data["input_ids"].clone()
+
+    # Test padded sequence for Ring Attention
+    padding = torch.zeros(1, data["input_ids"].shape[1] // 2, dtype=torch.long)
+    data["input_ids"] = torch.cat([data["input_ids"], padding], dim=1)
+    data["attention_mask"] = torch.cat([data["attention_mask"], padding], dim=1)
+
+    ignore_idx = -100
+    labels = data["input_ids"].clone()
+    labels[~data["attention_mask"].bool()] = ignore_idx
+    data["labels"] = labels
     return data
 
 
diff --git a/tests/test_shardformer/test_model/_utils.py b/tests/test_shardformer/test_model/_utils.py
index 9ad84341ac9e..0f9ec601387b 100644
--- a/tests/test_shardformer/test_model/_utils.py
+++ b/tests/test_shardformer/test_model/_utils.py
@@ -157,7 +157,6 @@ def build_model_from_hybrid_plugin(
         sharded_optimizer = sharded_optim_class(sharded_model.parameters(), lr=1e-3)
 
     criterion = loss_fn
-
     plugin = pluggin_cls(**test_config)
     booster = Booster(plugin=plugin)
 
@@ -323,7 +322,6 @@ def check_output_hidden_state(
         sp_size = shard_config.sequence_parallel_size
         if org_hidden_state.shape[seq_dim] == sharded_hidden_state.shape[seq_dim] * sp_size:
             org_hidden_state = org_hidden_state.chunk(sp_size, dim=seq_dim)[dist.get_rank(sp_group)]
-
     assert_close(org_hidden_state.float(), sharded_hidden_state.float(), atol=atol, rtol=rtol)
 
 
diff --git a/tests/test_shardformer/test_model/test_shard_chatglm2.py b/tests/test_shardformer/test_model/test_shard_chatglm2.py
index 92c077950ecc..17a8bf318976 100644
--- a/tests/test_shardformer/test_model/test_shard_chatglm2.py
+++ b/tests/test_shardformer/test_model/test_shard_chatglm2.py
@@ -136,26 +136,25 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @parameterize(
     "test_config",
     [
-        {  # Ulysess + Flash attention
-            "tp_size": 1,
+        {
+            "tp_size": 2,
             "pp_size": 2,
-            "sp_size": 2,
             "num_microbatches": 2,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
+            "sequence_parallelism_mode": "split_gather",
             "enable_flash_attention": True,
             "use_lazy_init": True,
             "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 2,
+        {  # Ulysess + Flash attention
+            "tp_size": 1,
             "pp_size": 2,
             "sp_size": 2,
             "num_microbatches": 2,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "split_gather",
+            "sequence_parallelism_mode": "all_to_all",
             "enable_flash_attention": True,
             "use_lazy_init": True,
             "zero_stage": 1,
@@ -174,17 +173,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "num_microbatches": 1,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "ring",
-            "enable_flash_attention": False,
-            "use_lazy_init": True,
-            "precision": "fp32",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 4,
             "pp_size": 1,
@@ -248,7 +236,11 @@ def run_chatglm_test(test_config):
         loss_fn,
         _,
     ) in sub_model_zoo.items():
-        check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        try:
+            check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        except Exception as e:
+            print(f"Test config failed for model {name}: {test_config}")
+            raise e
 
     clear_layout_converter()
     torch.cuda.empty_cache()
diff --git a/tests/test_shardformer/test_model/test_shard_command.py b/tests/test_shardformer/test_model/test_shard_command.py
index efe5cee2a2b6..9435ef84bfa8 100644
--- a/tests/test_shardformer/test_model/test_shard_command.py
+++ b/tests/test_shardformer/test_model/test_shard_command.py
@@ -125,7 +125,14 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             atol, rtol = 5e-3, 5e-3
 
         if org_model.__class__.__name__ == "CohereModel":
-            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
+            check_output_hidden_state(
+                org_output,
+                sharded_output,
+                stage_manager,
+                atol=atol,
+                rtol=rtol,
+                shard_config=booster.plugin.shard_config,
+            )
 
         check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol)
 
@@ -274,7 +281,11 @@ def run_command_test(test_config):
     sub_model_zoo = model_zoo.get_sub_registry("transformers_command", "transformers_command_for_causal_lm")
 
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
-        check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        try:
+            check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        except Exception as e:
+            print(f"Failed test config: {test_config}")
+            raise e
 
     clear_layout_converter()
     Randomizer.reset_index()
diff --git a/tests/test_shardformer/test_model/test_shard_gpt2.py b/tests/test_shardformer/test_model/test_shard_gpt2.py
index f9e368c0ebf3..393f7ffca7d3 100644
--- a/tests/test_shardformer/test_model/test_shard_gpt2.py
+++ b/tests/test_shardformer/test_model/test_shard_gpt2.py
@@ -100,7 +100,14 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             atol, rtol = 5e-3, 5e-3
 
         if org_model.__class__.__name__ == "GPT2Model":
-            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol)
+            check_output_hidden_state(
+                org_output,
+                sharded_output,
+                stage_manager,
+                atol=atol,
+                rtol=rtol,
+                shard_config=booster.plugin.shard_config,
+            )
 
         check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol)
 
@@ -132,14 +139,27 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     "test_config",
     [
         {
-            "tp_size": 4,
+            "sp_size": 2,
+            "tp_size": 1,
+            "pp_size": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "ring_attn",
+            "num_microbatches": 2,
+            "enable_all_optimization": True,
+            "use_lazy_init": True,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "sp_size": 2,
+            "tp_size": 2,
             "pp_size": 1,
-            "num_microbatches": 1,
             "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "ring",
-            "enable_flash_attention": False,
+            "sequence_parallelism_mode": "ring_attn",
+            "num_microbatches": 1,
+            "enable_all_optimization": True,
             "use_lazy_init": True,
-            "precision": "fp32",
+            "precision": "fp16",
             "initial_scale": 1,
         },
         {
@@ -148,7 +168,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "num_microbatches": 1,
             "enable_sequence_parallelism": True,
             "sequence_parallelism_mode": "split_gather",
-            "enable_flash_attention": False,
+            "enable_flash_attention": True,
             "use_lazy_init": True,
             "precision": "fp16",
             "initial_scale": 1,
@@ -156,7 +176,18 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         {
             "tp_size": 2,
             "pp_size": 2,
-            "num_microbatches": 4,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "split_gather",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {
+            "tp_size": 2,
+            "pp_size": 2,
+            "num_microbatches": 2,
             "enable_all_optimization": True,
             "use_lazy_init": True,
             "precision": "fp16",
@@ -185,7 +216,16 @@ def run_gpt2_test(test_config):
         loss_fn,
         _,
     ) in sub_model_zoo.items():
-        check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+
+        if test_config.get("sequence_parallelism_mode", None) == "ring_attn" and name != "transformers_gpt_lm":
+            # Only wrote zigzag splitting for cross entropy loss
+            continue
+
+        try:
+            check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        except Exception as e:
+            print(f"Failed config: {test_config} for model {name}")
+            raise (e)
 
     clear_layout_converter()
     torch.cuda.empty_cache()
@@ -226,7 +266,11 @@ def run_gpt2_3d_test(test_config):
         loss_fn,
         _,
     ) in sub_model_zoo.items():
-        check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        try:
+            check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
+        except Exception as e:
+            print(f"Failed config: {test_config} for model {name}")
+            raise (e)
 
     clear_layout_converter()
     torch.cuda.empty_cache()
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index 3c66f609787a..e8f7916972a5 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -165,7 +165,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "zero_stage": 0,
             "precision": "fp16",
             "initial_scale": 1,
-            "inner_ring_size": 2,
         },
         # Ring Attention + PP
         {
@@ -215,18 +214,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "sequence_parallelism_mode": "all_to_all",
             "enable_all_optimization": True,
             "use_lazy_init": True,
-            "zero_stage": 0,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
-        {
-            "tp_size": 4,
-            "pp_size": 1,
-            "num_microbatches": 1,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "split_gather",
-            "enable_flash_attention": True,
-            "use_lazy_init": True,
+            "zero_stage": 1,
             "precision": "fp16",
             "initial_scale": 1,
         },
@@ -294,6 +282,7 @@ def run_llama_test(test_config):
         except Exception as e:
             print(f"Failed config: {test_config}, model name: {name}")
             raise e
+
     clear_layout_converter()
     Randomizer.reset_index()
     torch.cuda.empty_cache()
diff --git a/tests/test_shardformer/test_model/test_shard_qwen2.py b/tests/test_shardformer/test_model/test_shard_qwen2.py
index c87415b7562d..865563adc625 100644
--- a/tests/test_shardformer/test_model/test_shard_qwen2.py
+++ b/tests/test_shardformer/test_model/test_shard_qwen2.py
@@ -94,6 +94,32 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
 @parameterize(
     "test_config",
     [
+        {
+            "tp_size": 2,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "split_gather",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
+        {  # Ulysess + Flash attention
+            "tp_size": 1,
+            "pp_size": 2,
+            "sp_size": 2,
+            "num_microbatches": 2,
+            "enable_sequence_parallelism": True,
+            "sequence_parallelism_mode": "all_to_all",
+            "enable_flash_attention": True,
+            "use_lazy_init": True,
+            "zero_stage": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+        },
         {
             "tp_size": 2,
             "pp_size": 2,
@@ -135,32 +161,6 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {  # Ulysess + Flash attention
-            "tp_size": 1,
-            "pp_size": 2,
-            "sp_size": 2,
-            "num_microbatches": 2,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "all_to_all",
-            "enable_flash_attention": True,
-            "use_lazy_init": True,
-            "zero_stage": 1,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
-        {
-            "tp_size": 2,
-            "pp_size": 2,
-            "sp_size": 2,
-            "num_microbatches": 2,
-            "enable_sequence_parallelism": True,
-            "sequence_parallelism_mode": "split_gather",
-            "enable_flash_attention": True,
-            "use_lazy_init": True,
-            "zero_stage": 1,
-            "precision": "fp16",
-            "initial_scale": 1,
-        },
         {
             "tp_size": 2,
             "pp_size": 2,

From c54c4fcd15b70830e2efe00df0a6087b9ce5f6b1 Mon Sep 17 00:00:00 2001
From: botbw <wang1570@e.ntu.edu.sg>
Date: Tue, 10 Sep 2024 17:30:53 +0800
Subject: [PATCH 073/167] [hotfix] moe hybrid parallelism benchmark & follow-up
 fix (#6048)

* [example] pass use_fp8_comm flag to all plugins

* [example] add mixtral benchmark

* [moe] refine assertion and check

* [moe] fix mixtral & add more tests

* [moe] consider checking dp * sp group and moe_dp_group

* [mixtral] remove gate tp & add more tests

* [deepseek] fix tp & sp for deepseek

* [mixtral] minor fix

* [deepseek] add deepseek benchmark
---
 .../plugin/moe_hybrid_parallel_plugin.py      |  35 ++-
 colossalai/moe/_operation.py                  |   7 +-
 colossalai/shardformer/modeling/deepseek.py   |  81 +++++-
 colossalai/shardformer/modeling/mixtral.py    |   4 +-
 colossalai/shardformer/policies/deepseek.py   |  36 ++-
 colossalai/shardformer/policies/mixtral.py    |  22 +-
 examples/language/deepseek/benchmark.py       | 271 ++++++++++++++++++
 examples/language/deepseek/data_utils.py      |   1 +
 examples/language/deepseek/model_utils.py     |   1 +
 .../deepseek/performance_evaluator.py         |   1 +
 examples/language/deepseek/test_ci.sh         |   0
 examples/language/llama/benchmark.py          |   6 +-
 examples/language/mixtral/benchmark.py        | 259 +++++++++++++++++
 examples/language/mixtral/data_utils.py       |   1 +
 examples/language/mixtral/model_utils.py      |   1 +
 .../language/mixtral/performance_evaluator.py |   1 +
 examples/language/mixtral/test_ci.sh          |   0
 tests/test_moe/moe_utils.py                   |  71 ++++-
 tests/test_moe/test_moe_checkpoint.py         |   4 +-
 .../test_model/test_shard_deepseek.py         | 102 +++++--
 .../test_model/test_shard_mixtral.py          | 102 +++++--
 21 files changed, 907 insertions(+), 99 deletions(-)
 create mode 100644 examples/language/deepseek/benchmark.py
 create mode 120000 examples/language/deepseek/data_utils.py
 create mode 120000 examples/language/deepseek/model_utils.py
 create mode 120000 examples/language/deepseek/performance_evaluator.py
 create mode 100755 examples/language/deepseek/test_ci.sh
 create mode 100644 examples/language/mixtral/benchmark.py
 create mode 120000 examples/language/mixtral/data_utils.py
 create mode 120000 examples/language/mixtral/model_utils.py
 create mode 120000 examples/language/mixtral/performance_evaluator.py
 create mode 100755 examples/language/mixtral/test_ci.sh

diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index 74d35f5c5505..2324a5239d79 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -64,13 +64,18 @@ def __init__(
         forced_dtype: Optional[torch.dtype] = None,
         overlap_allgather: bool = False,
     ):
-        pg_param_list = {
-            dp_process_group: list(filter(lambda p: not is_moe_tensor(p), model.parameters())),
-            moe_dp_group: list(filter(is_moe_tensor, model.parameters())),
-        }
+        if dp_process_group is moe_dp_group:
+            pg_param_list = {
+                dp_process_group: list(model.parameters()),
+            }
+        else:
+            pg_param_list = {
+                dp_process_group: list(filter(lambda p: not is_moe_tensor(p), model.parameters())),
+                moe_dp_group: list(filter(is_moe_tensor, model.parameters())),
+            }
 
-        if len(pg_param_list[dp_process_group]) == 0 or len(pg_param_list[moe_dp_group]) == 0:
-            raise ValueError("No parameters found in dp_process_group or moe_dp_group")
+        if len(pg_param_list[moe_dp_group]) == 0:
+            raise ValueError("No parameters found in moe_dp_group, please consider using HybridParallelPlugin instead")
 
         super().__init__(
             model=model,
@@ -407,6 +412,13 @@ def configure(
                 and self.enable_sequence_parallelism
                 and self.sequence_parallelism_mode == "all_to_all"
             )
+
+            # sync gradients across DP * SP ranks
+            if self.enable_sequence_parallelism and self.sequence_parallelism_mode == "all_to_all":
+                dp_group = self.pg_mesh.create_group_along_axis([self.moe_dp_axis, self.ep_axis, self.sp_axis])
+            else:
+                dp_group = self.dp_group
+
             if use_ddp:
                 self.logger.warning(
                     f"Will have to check all params are used in pytorch DDP since not all experts are always activated",
@@ -414,17 +426,11 @@ def configure(
                 )
                 self.ddp_config["find_unused_parameters"] = True
 
-                if dist.get_process_group_ranks(self.dp_group) != dist.get_process_group_ranks(self.moe_dp_group):
+                if dist.get_process_group_ranks(dp_group) != dist.get_process_group_ranks(self.moe_dp_group):
                     raise ValueError(
-                        f"if pytorch ddp is used, dp_group and moe_dp_group are expected to be the same since DDP can only reduce grad across a single group, but found dp_group {dist.get_process_group_ranks(self.dp_group)} and moe_dp_group {dist.get_process_group_ranks(self.moe_dp_group)}, you might want to use HybridParallelPlugin (i.e. set ep_size = 1) or set zero_stage > 0"
+                        f"if pytorch DDP is used, dp_group and moe_dp_group are expected to be the same since DDP can only reduce grad across a single group, but found dp_group {dist.get_process_group_ranks(dp_group)} and moe_dp_group {dist.get_process_group_ranks(self.moe_dp_group)}, you might want to modify your config to bypass DDP \nhint: check the above ddp condition to by pass this"
                     )
 
-            # sync gradients across DP * SP ranks
-            if self.enable_sequence_parallelism and self.sequence_parallelism_mode == "all_to_all":
-                dp_group = self.pg_mesh.create_group_along_axis([self.moe_dp_axis, self.ep_axis, self.sp_axis])
-            else:
-                dp_group = self.dp_group
-
             model = HybridParallelModule(
                 module=model,
                 precision=self.precision,
@@ -466,6 +472,7 @@ def configure(
                         tp_process_group=self.tp_group,
                     )
             else:
+                is_zero = True
                 if self.dp_size <= 1:
                     self.logger.warning(
                         "Use Zero Optimizer when data parallel size is 1 may introduce unnecessary overhead. "
diff --git a/colossalai/moe/_operation.py b/colossalai/moe/_operation.py
index ba087a03b728..62904d90eef8 100644
--- a/colossalai/moe/_operation.py
+++ b/colossalai/moe/_operation.py
@@ -308,7 +308,7 @@ def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, None]:
         assert len(grad_outputs) == 1
         grad = grad_outputs[0]
         if ctx.ep_size != 1:
-            grad = grad * ctx.ep_size
+            grad.mul_(ctx.ep_size)
         return grad, None
 
 
@@ -328,7 +328,7 @@ def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, None]:
         assert len(grad_outputs) == 1
         grad = grad_outputs[0]
         if ctx.ep_size != 1:
-            grad = grad / ctx.ep_size
+            grad.div_(ctx.ep_size)
         return grad, None
 
 
@@ -449,7 +449,4 @@ def all_to_all_uneven(
     overlap: bool = False,
     fp8_communication: bool = False,
 ):
-    assert (
-        inputs.requires_grad
-    ), "Input must require grad to assure that backward is executed, otherwise it might hang the program."
     return AllToAllUneven.apply(inputs, input_split_sizes, output_split_sizes, group, overlap, fp8_communication)
diff --git a/colossalai/shardformer/modeling/deepseek.py b/colossalai/shardformer/modeling/deepseek.py
index 7ec390d6ab98..4b1b82b7c770 100644
--- a/colossalai/shardformer/modeling/deepseek.py
+++ b/colossalai/shardformer/modeling/deepseek.py
@@ -3,7 +3,7 @@
 
 import torch
 import torch.distributed as dist
-import torch.nn as nn
+import torch.functional as F
 from torch.distributed import ProcessGroup
 from torch.nn import CrossEntropyLoss
 from transformers.cache_utils import Cache, DynamicCache
@@ -28,11 +28,13 @@
 from colossalai.shardformer.layer._operation import (
     all_to_all_comm,
     gather_forward_split_backward,
+    linear_with_async_comm,
     split_forward_gather_backward,
 )
-from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row
+from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row, ParallelModule
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.shardformer.shard.utils import set_tensors_to_none
+from colossalai.tensor.d_tensor.api import shard_rowwise, sharded_tensor_to_existing_param
 from colossalai.tensor.moe_tensor.api import set_moe_tensor_ep_group
 
 
@@ -58,7 +60,7 @@ def backward(ctx, grad_output):
         return grad_output, grad_loss
 
 
-class EPDeepseekMoE(nn.Module):
+class EPDeepseekMoE(ParallelModule):
     def __init__(self):
         raise RuntimeError(f"Please use `from_native_module` to create an instance of {self.__class__.__name__}")
 
@@ -214,6 +216,79 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return output_hidden_states
 
 
+class DeepseekMoEGate_Col(ParallelModule):
+    def parallel_linear(self, hidden_states):
+        assert (
+            hidden_states.shape[-1] == self.weight.shape[-1]
+        ), "Invalid shapes in Linear1D_Col forward: input={}, weight={}. Expected last dim of input {}.".format(
+            hidden_states.shape, self.weight.shape, self.weight.shape[-1]
+        )
+
+        output = linear_with_async_comm(
+            hidden_states, self.weight, None, self.process_group, True, fp8_communication=self.fp8_communication
+        )
+
+        # All-gather across the partitions.
+        output = gather_forward_split_backward(
+            output, dim=-1, process_group=self.process_group, fp8_communication=self.fp8_communication
+        )
+        return output
+
+    def forward(self, hidden_states):
+        bsz, seq_len, h = hidden_states.shape
+        ### compute gating score
+        hidden_states = hidden_states.view(-1, h)
+        logits = self.parallel_linear(hidden_states)
+        if self.scoring_func == "softmax":
+            scores = logits.softmax(dim=-1)
+        else:
+            raise NotImplementedError(f"insupportable scoring function for MoE gating: {self.scoring_func}")
+
+        ### select top-k experts
+        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
+
+        ### norm gate to sum 1
+        if self.top_k > 1 and self.norm_topk_prob:
+            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
+            topk_weight = topk_weight / denominator
+
+        ### expert-level computation auxiliary loss
+        if self.training and self.alpha > 0.0:
+            scores_for_aux = scores
+            aux_topk = self.top_k
+            # always compute aux loss based on the naive greedy topk method
+            topk_idx_for_aux_loss = topk_idx.view(bsz, -1)
+            if self.seq_aux:
+                scores_for_seq_aux = scores_for_aux.view(bsz, seq_len, -1)
+                ce = torch.zeros(bsz, self.n_routed_experts, device=hidden_states.device)
+                ce.scatter_add_(
+                    1, topk_idx_for_aux_loss, torch.ones(bsz, seq_len * aux_topk, device=hidden_states.device)
+                ).div_(seq_len * aux_topk / self.n_routed_experts)
+                aux_loss = (ce * scores_for_seq_aux.mean(dim=1)).sum(dim=1).mean() * self.alpha
+            else:
+                mask_ce = F.one_hot(topk_idx_for_aux_loss.view(-1), num_classes=self.n_routed_experts)
+                ce = mask_ce.float().mean(0)
+                Pi = scores_for_aux.mean(0)
+                fi = ce * self.n_routed_experts
+                aux_loss = (Pi * fi).sum() * self.alpha
+        else:
+            aux_loss = None
+
+        return topk_idx, topk_weight, aux_loss
+
+    @staticmethod
+    def from_native_module(
+        module, process_group: ProcessGroup, config, gather_output, fp8_communication
+    ) -> "DeepseekMoEGate_Col":
+        LazyInitContext.materialize(module)
+        module.process_group = process_group
+        module.fp8_communication = fp8_communication
+        sharded_weight = shard_rowwise(module.weight.data, process_group)
+        sharded_tensor_to_existing_param(sharded_weight, module.weight)
+        module.__class__ = DeepseekMoEGate_Col
+        return module
+
+
 class DeepseekPipelineForwards:
     """
     This class serves as a micro library for forward function substitution of Llama models
diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index 4850ef1b6929..0103808dc6a9 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -36,7 +36,7 @@
     gather_forward_split_backward,
     split_forward_gather_backward,
 )
-from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row
+from colossalai.shardformer.layer.linear import Linear1D_Col, Linear1D_Row, ParallelModule
 from colossalai.shardformer.shard import ShardConfig
 from colossalai.shardformer.shard.utils import set_tensors_to_none
 from colossalai.tensor.moe_tensor.api import set_moe_tensor_ep_group
@@ -49,7 +49,7 @@
     _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
 
 
-class EPMixtralSparseMoeBlock(MixtralSparseMoeBlock):
+class EPMixtralSparseMoeBlock(ParallelModule):
     def __init__(self, *args, **kwargs):
         raise RuntimeError(f"Please use `from_native_module` to create an instance of {self.__class__.__name__}")
 
diff --git a/colossalai/shardformer/policies/deepseek.py b/colossalai/shardformer/policies/deepseek.py
index 0b8a602d1164..bd54e6f2db9e 100644
--- a/colossalai/shardformer/policies/deepseek.py
+++ b/colossalai/shardformer/policies/deepseek.py
@@ -10,6 +10,7 @@
 from colossalai.shardformer.layer.embedding import PaddingEmbedding, VocabParallelEmbedding1D
 from colossalai.shardformer.layer.linear import Linear1D_Row
 from colossalai.shardformer.modeling.deepseek import (
+    DeepseekMoEGate_Col,
     DeepseekPipelineForwards,
     EPDeepseekMoE,
     get_deepseek_flash_attention_forward,
@@ -56,16 +57,24 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         sp_size = self.shard_config.sequence_parallel_size or None
         sp_group = self.shard_config.sequence_parallel_process_group or None
         sp_partial_derived = sp_mode in ["split_gather", "ring"]
+        tp_size = self.shard_config.tensor_parallel_size
+
+        # modified for both SP and TP
+        num_q_heads = self.model.config.num_attention_heads
+        num_kv_heads = getattr(self.model.config, "num_key_value_heads", None)
         if sp_mode == "all_to_all":
+            num_q_heads //= sp_size
             decoder_attribute_replacement = {
-                "num_heads": self.model.config.num_attention_heads // sp_size,
+                "num_heads": num_q_heads,
             }
             if getattr(self.model.config, "num_key_value_heads", False):
-                decoder_attribute_replacement["num_key_value_heads"] = self.model.config.num_key_value_heads // sp_size
+                num_kv_heads //= sp_size
+                decoder_attribute_replacement["num_key_value_heads"] = num_kv_heads
 
             policy[attn_cls] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
             )
+
         if self.shard_config.enable_sequence_parallelism:
             if self.pipeline_stage_manager is not None:
                 # NOTE: we are replacing model forward for both sequence parallelism and pipeline parallelism
@@ -97,6 +106,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         else:
             if self.tie_weight:
                 embedding_cls = PaddingEmbedding
+
         if self.shard_config.enable_tensor_parallelism:
             # tensor parallelism for non-moe params
             assert (
@@ -107,10 +117,15 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
             ), f"The number of key_value heads must be divisible by tensor parallel size."
             decoder_attribute_replacement = {
                 "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
-                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
-                "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
-                // self.shard_config.tensor_parallel_size,
             }
+            num_q_heads //= tp_size
+            decoder_attribute_replacement = {
+                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": num_q_heads,
+            }
+            if num_kv_heads:
+                num_kv_heads //= tp_size
+                decoder_attribute_replacement["self_attn.num_key_value_heads"] = num_kv_heads
 
             policy["DeepseekDecoderLayer"] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
@@ -135,8 +150,19 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         target_module=Linear1D_Row,
                         kwargs={"fp8_communication": self.shard_config.fp8_communication},
                     ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.gate",
+                        target_module=DeepseekMoEGate_Col,
+                        kwargs={
+                            "gather_output": True,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                            "config": self.model.config,
+                        },
+                        ignore_if_not_exist=True,
+                    ),
                 ],
             )
+
         if embedding_cls is not None:
             self.append_or_create_submodule_replacement(
                 description=SubModuleReplacementDescription(
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index 3a373889c5c6..9f03319e7e5b 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -51,12 +51,20 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         sp_size = self.shard_config.sequence_parallel_size or None
         sp_group = self.shard_config.sequence_parallel_process_group or None
         sp_partial_derived = sp_mode in ["split_gather", "ring"]
+        tp_size = self.shard_config.tensor_parallel_size
+
+        # modified for both SP and TP
+        num_q_heads = self.model.config.num_attention_heads
+        num_kv_heads = getattr(self.model.config, "num_key_value_heads", None)
+
         if sp_mode == "all_to_all":
+            num_q_heads //= sp_size
             decoder_attribute_replacement = {
-                "num_heads": self.model.config.num_attention_heads // sp_size,
+                "num_heads": num_q_heads,
             }
             if getattr(self.model.config, "num_key_value_heads", False):
-                decoder_attribute_replacement["num_key_value_heads"] = self.model.config.num_key_value_heads // sp_size
+                num_kv_heads //= sp_size
+                decoder_attribute_replacement["num_key_value_heads"] = num_kv_heads
 
             policy[attn_cls] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
@@ -101,12 +109,14 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
             assert (
                 self.model.config.num_key_value_heads % self.shard_config.tensor_parallel_size == 0
             ), f"The number of key_value heads must be divisible by tensor parallel size."
+            num_q_heads //= tp_size
             decoder_attribute_replacement = {
                 "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
-                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
-                "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
-                // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": num_q_heads,
             }
+            if num_kv_heads:
+                num_kv_heads //= tp_size
+                decoder_attribute_replacement["self_attn.num_key_value_heads"] = num_kv_heads
 
             policy[MixtralDecoderLayer] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
@@ -131,7 +141,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         target_module=Linear1D_Row,
                         kwargs={"fp8_communication": self.shard_config.fp8_communication},
                     ),
-                    SubModuleReplacementDescription(  # or replicate?
+                    SubModuleReplacementDescription(
                         suffix="block_sparse_moe.gate",
                         target_module=Linear1D_Col,
                         kwargs={"gather_output": True, "fp8_communication": self.shard_config.fp8_communication},
diff --git a/examples/language/deepseek/benchmark.py b/examples/language/deepseek/benchmark.py
new file mode 100644
index 000000000000..fef181e71211
--- /dev/null
+++ b/examples/language/deepseek/benchmark.py
@@ -0,0 +1,271 @@
+# modified from mixtral benchmark
+import argparse
+import resource
+import time
+import warnings
+from contextlib import nullcontext
+
+import torch
+import torch.distributed as dist
+from data_utils import RandomDataset
+from model_utils import format_numel_str, get_model_numel
+from performance_evaluator import PerformanceEvaluator, get_profile_context
+from tqdm import tqdm
+from transformers import AutoConfig, AutoModelForCausalLM
+
+import colossalai
+from colossalai.accelerator import get_accelerator
+from colossalai.booster import Booster
+from colossalai.booster.plugin import MoeHybridParallelPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.shardformer import PipelineGradientCheckpointConfig
+
+warnings.filterwarnings("ignore")
+# ==============================
+# Constants
+# ==============================
+
+# We have lots of llamas for your choice!
+MODEL_CONFIGS = {
+    "100m": lambda: AutoConfig.from_pretrained(
+        "deepseek-ai/deepseek-moe-16b-base",
+        max_position_embeddings=4096,
+        num_hidden_layers=1,
+        num_attention_heads=32,
+        intermediate_size=512,
+        moe_intermediate_size=128,
+        hidden_size=512,
+        n_routed_experts=8,
+        n_shared_experts=4,
+        num_experts_per_tok=2,
+        first_k_dense_replace=0,
+        attn_implementation="flash_attention_2",
+        trust_remote_code=True,
+    ),
+    "7b": lambda: AutoConfig.from_pretrained(
+        "deepseek-ai/deepseek-moe-16b-base",
+        max_position_embeddings=4096,
+        num_hidden_layers=13,
+        attn_implementation="flash_attention_2",
+        trust_remote_code=True,
+    ),
+    "14b": lambda: AutoConfig.from_pretrained(
+        "deepseek-ai/deepseek-moe-16b-base",
+        max_position_embeddings=4096,
+        num_hidden_layers=26,
+        attn_implementation="flash_attention_2",
+        trust_remote_code=True,
+    ),
+}
+
+
+def main():
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-c", "--config", type=str, default="100m", help="Model configuration")
+    parser.add_argument(
+        "-p",
+        "--plugin",
+        choices=["3d"],
+        default="3d",
+        help="Choose which plugin to use",
+    )
+    parser.add_argument("-b", "--batch_size", type=int, default=1, help="Batch size")
+    parser.add_argument("-s", "--num_steps", type=int, default=5, help="Number of steps to run")
+    parser.add_argument("-i", "--ignore_steps", type=int, default=2, help="Number of steps to ignore")
+    parser.add_argument("-g", "--grad_checkpoint", action="store_true", help="Use gradient checkpointing")
+    parser.add_argument("-l", "--max_length", type=int, default=4096, help="Max sequence length")
+    parser.add_argument(
+        "-w", "--warmup_ratio", type=float, default=0.8, help="warm up ratio of non-model data. Only for gemini-auto"
+    )
+    parser.add_argument("-m", "--memory_limit", type=int, help="Gemini memory limit in mb")
+    parser.add_argument("-x", "--xformers", action="store_true", help="Use xformers")
+    parser.add_argument("--shard_param_frac", type=float, default=1.0, help="Shard param fraction. Only for gemini")
+    parser.add_argument("--offload_optim_frac", type=float, default=0.0, help="Offload optim fraction. Only for gemini")
+    parser.add_argument("--offload_param_frac", type=float, default=0.0, help="Offload param fraction. Only for gemini")
+    parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+    parser.add_argument("--ep", type=int, default=1, help="Expert parallel size")
+    parser.add_argument("--sp", type=int, default=1, help="Sequence parallel size")
+    parser.add_argument("--extra_dp", type=int, default=1, help="Extra data parallel size, used for Gemini")
+    parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel size")
+    parser.add_argument("--mbs", type=int, default=1, help="Micro batch size of pipeline parallel")
+    parser.add_argument("--zero", type=int, default=1, help="Zero Stage when hybrid plugin is enabled")
+    parser.add_argument("--custom-ckpt", action="store_true", help="Customize checkpoint", default=False)
+
+    parser.add_argument("--pp_style", default="1f1b", choices=["1f1b", "interleaved"])
+    parser.add_argument("--n_chunks", default=1, help="number of model chunks", type=eval)
+    parser.add_argument("--profile", action="store_true", help="Profile the code")
+    parser.add_argument(
+        "--nsys",
+        action="store_true",
+        help="Use nsys for profiling. \
+        You should put something like this before colossalai launch: \
+        nsys profile -w true -t cuda,cudnn,cublas -s cpu --capture-range=cudaProfilerApi --capture-range-end=stop --cudabacktrace=true -x true --python-backtrace=cuda -o prof_out",
+    )
+    parser.add_argument("--disable-async-reduce", action="store_true", help="Disable the asynchronous reduce operation")
+    parser.add_argument("--prefetch_num", type=int, default=0, help="chunk prefetch max number")
+    parser.add_argument("--no_cache", action="store_true")
+    parser.add_argument("--use_fp8_comm", action="store_true", default=False, help="for using fp8 during communication")
+    parser.add_argument("--use_fp8", action="store_true", default=False, help="for using fp8 linear")
+    parser.add_argument("--overlap_allgather", action="store_true")
+    parser.add_argument(
+        "--sp_mode",
+        default="all_to_all",
+        choices=["all_to_all"],
+        help="Sequence parallelism mode",
+    )
+    parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+    args = parser.parse_args()
+
+    colossalai.launch_from_torch()
+    coordinator = DistCoordinator()
+
+    # ckpt config for LLaMA3-70B on 64 H100 GPUs
+    hybrid_kwargs = (
+        {
+            "gradient_checkpoint_config": PipelineGradientCheckpointConfig(
+                num_ckpt_layers_per_stage=[19, 19, 19, 13],
+            ),
+            "num_layers_per_stage": [19, 20, 20, 21],
+            "pp_style": "interleaved",
+        }
+        if args.custom_ckpt
+        else {}
+    )
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "3d":
+        plugin = MoeHybridParallelPlugin(
+            ep_size=args.ep,
+            tp_size=args.tp,
+            pp_size=args.pp,
+            pp_style=args.pp_style,
+            num_model_chunks=args.n_chunks,
+            zero_stage=args.zero,
+            sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
+            enable_sequence_parallelism=args.sp > 1,
+            enable_fused_normalization=torch.cuda.is_available(),
+            enable_flash_attention=args.xformers,
+            microbatch_size=args.mbs,
+            precision="bf16",
+            enable_metadata_cache=not args.no_cache,
+            overlap_allgather=args.overlap_allgather,
+            use_fp8=args.use_fp8,
+            fp8_communication=args.use_fp8_comm,
+            **hybrid_kwargs,
+        )
+    else:
+        raise ValueError(f"Unknown plugin {args.plugin}")
+
+    booster = Booster(plugin=plugin)
+
+    # ==============================
+    # Initialize Dataset and Dataloader
+    # ==============================
+    dp_size = getattr(plugin, "dp_size", coordinator.world_size)
+
+    config = MODEL_CONFIGS[args.config]()
+
+    torch.cuda.manual_seed(42)
+
+    dataset = RandomDataset(
+        num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
+    )
+    dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True, seed=42)
+
+    # ==============================
+    # Initialize Model and Optimizer
+    # ==============================
+    init_ctx = (
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if isinstance(plugin, MoeHybridParallelPlugin)
+        else nullcontext()
+    )
+
+    with init_ctx:
+        model = AutoModelForCausalLM.from_config(config, trust_remote_code=True).to(torch.bfloat16)
+
+    if args.grad_checkpoint:
+        model.gradient_checkpointing_enable()
+
+    model_numel = get_model_numel(model)
+    coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
+    performance_evaluator = PerformanceEvaluator(
+        model_numel,
+        model.config.num_hidden_layers,
+        model.config.hidden_size,
+        model.config.vocab_size,
+        args.grad_checkpoint,
+        args.ignore_steps,
+        dp_world_size=dp_size,
+    )
+
+    optimizer = HybridAdam(model.parameters())
+    torch.set_default_dtype(torch.bfloat16)
+    model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
+
+    torch.set_default_dtype(torch.float)
+    coordinator.print_on_master(
+        f"Booster init max CUDA memory: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB"
+    )
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB"
+    )
+
+    with get_profile_context(
+        args.profile,
+        args.ignore_steps,
+        1,  # avoid creating massive log files
+        save_dir=f"profile/{time.strftime('%H:%M', time.localtime())}-{args.plugin}-llama-{args.config}",
+        nsys=args.nsys,
+    ) as prof:  # , distributed_debug_mode(10, enable=True):
+        if isinstance(plugin, MoeHybridParallelPlugin) and args.pp > 1:
+            data_iter = iter(dataloader)
+            for step in tqdm(range(len(dataloader)), desc="Step", disable=not coordinator.is_master()):
+                performance_evaluator.on_step_start(step)
+                outputs = booster.execute_pipeline(
+                    data_iter,
+                    model,
+                    criterion=lambda outputs, inputs: outputs[0],
+                    optimizer=optimizer,
+                    return_loss=True,
+                )
+                loss = outputs["loss"]
+                if dist.get_rank() == dist.get_world_size() - 1:
+                    print(f"Step {step} loss: {loss}")
+                optimizer.step()
+                optimizer.zero_grad()
+
+                performance_evaluator.on_step_end(input_ids=torch.empty(args.batch_size, args.max_length))
+                prof.step()
+                print(f"rank {dist.get_rank()} step {step} passed")
+        else:
+            for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())):
+                performance_evaluator.on_step_start(step)
+                outputs = model(**batch)
+                loss = outputs[0]
+                del outputs  # free memory
+
+                if dist.get_rank() == dist.get_world_size() - 1:
+                    print(f"Step {step} loss: {loss}")
+
+                booster.backward(loss, optimizer)
+                optimizer.step()
+                optimizer.zero_grad()
+
+                performance_evaluator.on_step_end(**batch)
+                prof.step()
+
+    performance_evaluator.on_fit_end()
+    coordinator.print_on_master(f"Max CUDA memory usage: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/language/deepseek/data_utils.py b/examples/language/deepseek/data_utils.py
new file mode 120000
index 000000000000..2da9822dfc57
--- /dev/null
+++ b/examples/language/deepseek/data_utils.py
@@ -0,0 +1 @@
+../data_utils.py
\ No newline at end of file
diff --git a/examples/language/deepseek/model_utils.py b/examples/language/deepseek/model_utils.py
new file mode 120000
index 000000000000..73c6818a8c8f
--- /dev/null
+++ b/examples/language/deepseek/model_utils.py
@@ -0,0 +1 @@
+../model_utils.py
\ No newline at end of file
diff --git a/examples/language/deepseek/performance_evaluator.py b/examples/language/deepseek/performance_evaluator.py
new file mode 120000
index 000000000000..f4736354b1f3
--- /dev/null
+++ b/examples/language/deepseek/performance_evaluator.py
@@ -0,0 +1 @@
+../performance_evaluator.py
\ No newline at end of file
diff --git a/examples/language/deepseek/test_ci.sh b/examples/language/deepseek/test_ci.sh
new file mode 100755
index 000000000000..e69de29bb2d1
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index bb14378ad571..0e88fabf1eb0 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -105,7 +105,7 @@ def main():
     parser.add_argument("--prefetch_num", type=int, default=0, help="chunk prefetch max number")
     parser.add_argument("--no_cache", action="store_true")
     parser.add_argument("--use_fp8_comm", action="store_true", default=False, help="for using fp8 during communication")
-    parser.add_argument("--use_fp8", action="store_true")
+    parser.add_argument("--use_fp8", action="store_true", default=False, help="for using fp8 linear")
     parser.add_argument("--overlap_allgather", action="store_true")
     parser.add_argument(
         "--sp_mode",
@@ -151,6 +151,7 @@ def empty_init():
             max_prefetch=args.prefetch_num,
             enable_async_reduce=not args.disable_async_reduce,
             use_fp8=args.use_fp8,
+            fp8_communication=args.use_fp8_comm,
         )
     elif args.plugin == "gemini_auto":
         plugin = GeminiPlugin(
@@ -164,6 +165,7 @@ def empty_init():
             enable_async_reduce=not args.disable_async_reduce,
             enable_flash_attention=args.xformers,
             use_fp8=args.use_fp8,
+            fp8_communication=args.use_fp8_comm,
         )
     elif args.plugin == "fsdp":
         if use_empty_init:
@@ -224,6 +226,7 @@ def empty_init():
             enable_metadata_cache=not args.no_cache,
             overlap_allgather=args.overlap_allgather,
             use_fp8=args.use_fp8,
+            fp8_communication=args.use_fp8_comm,
             **hybrid_kwargs,
         )
     elif args.plugin == "3d_cpu":
@@ -241,6 +244,7 @@ def empty_init():
             precision="bf16",
             overlap_p2p=args.overlap,
             use_fp8=args.use_fp8,
+            fp8_communication=args.use_fp8_comm,
         )
     else:
         raise ValueError(f"Unknown plugin {args.plugin}")
diff --git a/examples/language/mixtral/benchmark.py b/examples/language/mixtral/benchmark.py
new file mode 100644
index 000000000000..bb2a32d013f5
--- /dev/null
+++ b/examples/language/mixtral/benchmark.py
@@ -0,0 +1,259 @@
+# modified from llama benchmark
+import argparse
+import resource
+import time
+import warnings
+from contextlib import nullcontext
+
+import torch
+import torch.distributed as dist
+from data_utils import RandomDataset
+from model_utils import format_numel_str, get_model_numel
+from performance_evaluator import PerformanceEvaluator, get_profile_context
+from tqdm import tqdm
+from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM
+
+import colossalai
+from colossalai.accelerator import get_accelerator
+from colossalai.booster import Booster
+from colossalai.booster.plugin import MoeHybridParallelPlugin
+from colossalai.cluster import DistCoordinator
+from colossalai.lazy import LazyInitContext
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.shardformer import PipelineGradientCheckpointConfig
+
+warnings.filterwarnings("ignore")
+# ==============================
+# Constants
+# ==============================
+
+# We have lots of llamas for your choice!
+MODEL_CONFIGS = {
+    "100m": MixtralConfig(
+        max_position_embeddings=4096,
+        num_hidden_layers=4,
+        num_attention_heads=32,
+        intermediate_size=768,
+        hidden_size=768,
+        attn_implementation="flash_attention_2",
+    ),
+    "7b": MixtralConfig(
+        max_position_embeddings=4096,
+        num_hidden_layers=5,
+        attn_implementation="flash_attention_2",
+    ),
+    "14b": MixtralConfig(
+        max_position_embeddings=4096,
+        num_hidden_layers=10,
+        attn_implementation="flash_attention_2",
+    ),
+}
+
+
+def main():
+    # ==============================
+    # Parse Arguments
+    # ==============================
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-c", "--config", type=str, default="100m", help="Model configuration")
+    parser.add_argument(
+        "-p",
+        "--plugin",
+        choices=["3d"],
+        default="3d",
+        help="Choose which plugin to use",
+    )
+    parser.add_argument("-b", "--batch_size", type=int, default=1, help="Batch size")
+    parser.add_argument("-s", "--num_steps", type=int, default=5, help="Number of steps to run")
+    parser.add_argument("-i", "--ignore_steps", type=int, default=2, help="Number of steps to ignore")
+    parser.add_argument("-g", "--grad_checkpoint", action="store_true", help="Use gradient checkpointing")
+    parser.add_argument("-l", "--max_length", type=int, default=4096, help="Max sequence length")
+    parser.add_argument(
+        "-w", "--warmup_ratio", type=float, default=0.8, help="warm up ratio of non-model data. Only for gemini-auto"
+    )
+    parser.add_argument("-m", "--memory_limit", type=int, help="Gemini memory limit in mb")
+    parser.add_argument("-x", "--xformers", action="store_true", help="Use xformers")
+    parser.add_argument("--shard_param_frac", type=float, default=1.0, help="Shard param fraction. Only for gemini")
+    parser.add_argument("--offload_optim_frac", type=float, default=0.0, help="Offload optim fraction. Only for gemini")
+    parser.add_argument("--offload_param_frac", type=float, default=0.0, help="Offload param fraction. Only for gemini")
+    parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+    parser.add_argument("--ep", type=int, default=1, help="Expert parallel size")
+    parser.add_argument("--sp", type=int, default=1, help="Sequence parallel size")
+    parser.add_argument("--extra_dp", type=int, default=1, help="Extra data parallel size, used for Gemini")
+    parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel size")
+    parser.add_argument("--mbs", type=int, default=1, help="Micro batch size of pipeline parallel")
+    parser.add_argument("--zero", type=int, default=1, help="Zero Stage when hybrid plugin is enabled")
+    parser.add_argument("--custom-ckpt", action="store_true", help="Customize checkpoint", default=False)
+
+    parser.add_argument("--pp_style", default="1f1b", choices=["1f1b", "interleaved"])
+    parser.add_argument("--n_chunks", default=1, help="number of model chunks", type=eval)
+    parser.add_argument("--profile", action="store_true", help="Profile the code")
+    parser.add_argument(
+        "--nsys",
+        action="store_true",
+        help="Use nsys for profiling. \
+        You should put something like this before colossalai launch: \
+        nsys profile -w true -t cuda,cudnn,cublas -s cpu --capture-range=cudaProfilerApi --capture-range-end=stop --cudabacktrace=true -x true --python-backtrace=cuda -o prof_out",
+    )
+    parser.add_argument("--disable-async-reduce", action="store_true", help="Disable the asynchronous reduce operation")
+    parser.add_argument("--prefetch_num", type=int, default=0, help="chunk prefetch max number")
+    parser.add_argument("--no_cache", action="store_true")
+    parser.add_argument("--use_fp8_comm", action="store_true", default=False, help="for using fp8 during communication")
+    parser.add_argument("--use_fp8", action="store_true", default=False, help="for using fp8 linear")
+    parser.add_argument("--overlap_allgather", action="store_true")
+    parser.add_argument(
+        "--sp_mode",
+        default="all_to_all",
+        choices=["all_to_all"],
+        help="Sequence parallelism mode",
+    )
+    parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+    args = parser.parse_args()
+
+    colossalai.launch_from_torch()
+    coordinator = DistCoordinator()
+
+    # ckpt config for LLaMA3-70B on 64 H100 GPUs
+    hybrid_kwargs = (
+        {
+            "gradient_checkpoint_config": PipelineGradientCheckpointConfig(
+                num_ckpt_layers_per_stage=[19, 19, 19, 13],
+            ),
+            "num_layers_per_stage": [19, 20, 20, 21],
+            "pp_style": "interleaved",
+        }
+        if args.custom_ckpt
+        else {}
+    )
+
+    # ==============================
+    # Initialize Booster
+    # ==============================
+    if args.plugin == "3d":
+        plugin = MoeHybridParallelPlugin(
+            ep_size=args.ep,
+            tp_size=args.tp,
+            pp_size=args.pp,
+            pp_style=args.pp_style,
+            num_model_chunks=args.n_chunks,
+            zero_stage=args.zero,
+            sp_size=args.sp,
+            sequence_parallelism_mode=args.sp_mode,
+            enable_sequence_parallelism=args.sp > 1,
+            enable_fused_normalization=torch.cuda.is_available(),
+            enable_flash_attention=args.xformers,
+            microbatch_size=args.mbs,
+            precision="bf16",
+            enable_metadata_cache=not args.no_cache,
+            overlap_allgather=args.overlap_allgather,
+            use_fp8=args.use_fp8,
+            fp8_communication=args.use_fp8_comm,
+            **hybrid_kwargs,
+        )
+    else:
+        raise ValueError(f"Unknown plugin {args.plugin}")
+
+    booster = Booster(plugin=plugin)
+
+    # ==============================
+    # Initialize Dataset and Dataloader
+    # ==============================
+    dp_size = getattr(plugin, "dp_size", coordinator.world_size)
+
+    if args.config in MODEL_CONFIGS:
+        config = MODEL_CONFIGS[args.config]
+    else:
+        config = MixtralConfig.from_pretrained(args.config, trust_remote_code=True)
+    torch.cuda.manual_seed(42)
+
+    dataset = RandomDataset(
+        num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
+    )
+    dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True, seed=42)
+
+    # ==============================
+    # Initialize Model and Optimizer
+    # ==============================
+    init_ctx = (
+        LazyInitContext(default_device=get_accelerator().get_current_device())
+        if isinstance(plugin, MoeHybridParallelPlugin)
+        else nullcontext()
+    )
+
+    with init_ctx:
+        model = MixtralForCausalLM(config=config).to(torch.bfloat16)
+
+    if args.grad_checkpoint:
+        model.gradient_checkpointing_enable()
+
+    model_numel = get_model_numel(model)
+    coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
+    performance_evaluator = PerformanceEvaluator(
+        model_numel,
+        model.config.num_hidden_layers,
+        model.config.hidden_size,
+        model.config.vocab_size,
+        args.grad_checkpoint,
+        args.ignore_steps,
+        dp_world_size=dp_size,
+    )
+
+    optimizer = HybridAdam(model.parameters())
+    torch.set_default_dtype(torch.bfloat16)
+    model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
+
+    torch.set_default_dtype(torch.float)
+    coordinator.print_on_master(
+        f"Booster init max CUDA memory: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB"
+    )
+    coordinator.print_on_master(
+        f"Booster init max CPU memory: {resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024:.2f} MB"
+    )
+
+    with get_profile_context(
+        args.profile,
+        args.ignore_steps,
+        1,  # avoid creating massive log files
+        save_dir=f"profile/{time.strftime('%H:%M', time.localtime())}-{args.plugin}-llama-{args.config}",
+        nsys=args.nsys,
+    ) as prof:
+        if isinstance(plugin, MoeHybridParallelPlugin) and args.pp > 1:
+            data_iter = iter(dataloader)
+            for step in tqdm(range(len(dataloader)), desc="Step", disable=not coordinator.is_master()):
+                performance_evaluator.on_step_start(step)
+                outputs = booster.execute_pipeline(
+                    data_iter,
+                    model,
+                    criterion=lambda outputs, inputs: outputs[0],
+                    optimizer=optimizer,
+                    return_loss=True,
+                )
+                loss = outputs["loss"]
+                if dist.get_rank() == dist.get_world_size() - 1:
+                    print(f"Step {step} loss: {loss}")
+                optimizer.step()
+                optimizer.zero_grad()
+
+                performance_evaluator.on_step_end(input_ids=torch.empty(args.batch_size, args.max_length))
+                prof.step()
+        else:
+            for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not coordinator.is_master())):
+                performance_evaluator.on_step_start(step)
+                outputs = model(**batch)
+                loss = outputs[0]
+                del outputs  # free memory
+
+                if dist.get_rank() == dist.get_world_size() - 1:
+                    print(f"Step {step} loss: {loss}")
+                booster.backward(loss, optimizer)
+                optimizer.step()
+                optimizer.zero_grad()
+
+                performance_evaluator.on_step_end(**batch)
+                prof.step()
+    performance_evaluator.on_fit_end()
+    coordinator.print_on_master(f"Max CUDA memory usage: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/language/mixtral/data_utils.py b/examples/language/mixtral/data_utils.py
new file mode 120000
index 000000000000..2da9822dfc57
--- /dev/null
+++ b/examples/language/mixtral/data_utils.py
@@ -0,0 +1 @@
+../data_utils.py
\ No newline at end of file
diff --git a/examples/language/mixtral/model_utils.py b/examples/language/mixtral/model_utils.py
new file mode 120000
index 000000000000..73c6818a8c8f
--- /dev/null
+++ b/examples/language/mixtral/model_utils.py
@@ -0,0 +1 @@
+../model_utils.py
\ No newline at end of file
diff --git a/examples/language/mixtral/performance_evaluator.py b/examples/language/mixtral/performance_evaluator.py
new file mode 120000
index 000000000000..f4736354b1f3
--- /dev/null
+++ b/examples/language/mixtral/performance_evaluator.py
@@ -0,0 +1 @@
+../performance_evaluator.py
\ No newline at end of file
diff --git a/examples/language/mixtral/test_ci.sh b/examples/language/mixtral/test_ci.sh
new file mode 100755
index 000000000000..e69de29bb2d1
diff --git a/tests/test_moe/moe_utils.py b/tests/test_moe/moe_utils.py
index 8c411a33fef6..dbcd28ab5939 100644
--- a/tests/test_moe/moe_utils.py
+++ b/tests/test_moe/moe_utils.py
@@ -1,4 +1,12 @@
+import os
+import traceback
+from contextlib import contextmanager
+from time import sleep
+from typing import Callable, List, Optional
+
 import torch
+import torch.distributed as dist
+from torch.utils._pytree import tree_map
 
 
 def assert_loose_close(a, b, dtype: torch.dtype = torch.float32, name=""):
@@ -25,7 +33,66 @@ def loose_close(a, b, dtype: torch.dtype = torch.float32):
     return torch.allclose(a, b, rtol=rtol, atol=atol)
 
 
-def check_model_equal(model1, model2):
+def check_model_equal(model1, model2, dtype):
     assert set(model1.state_dict().keys()) == set(model2.state_dict().keys())
     for i, ((name, p1), p2) in enumerate(zip(model1.named_parameters(), model2.parameters())):
-        assert_loose_close(p1, p2, p1.dtype)
+        assert_loose_close(p1, p2, dtype, name=name)
+
+
+@contextmanager
+def distributed_debug_mode(num_stacks: int = 1, funcs_to_patch: Optional[List[Callable]] = None, enable=True):
+    if enable:
+        assert (
+            os.environ.get("CUDA_LAUNCH_BLOCKING", "0") == "1"
+        ), f"Expect CUDA_LAUNCH_BLOCKING=1, got {os.environ.get('CUDA_LAUNCH_BLOCKING', '0')}"
+    if funcs_to_patch is None:
+        funcs_to_patch = [
+            dist.all_reduce,
+            dist.all_reduce_coalesced,
+            dist.all_gather,
+            dist.all_gather_coalesced,
+            dist.all_gather_into_tensor,
+            dist.all_to_all,
+            dist.all_to_all_single,
+            dist.reduce_scatter,
+        ]
+
+    original_funcs = {}
+    patched_funcs = {}
+
+    def make_patched(func):
+        def patched_func(*args, **kwargs):
+            stack = traceback.format_stack()
+
+            def format_node(node):
+                if isinstance(node, torch.Tensor):
+                    return f"{node.shape}"
+                elif isinstance(node, list):
+                    return f"[{', '.join([format_node(n) for n in node])}]"
+
+                return str(node)
+
+            args_str, kwargs_str = tree_map(format_node, (args, kwargs))
+            en = len(stack) - 1
+            st = max(0, en - num_stacks)
+            dist.barrier()
+            sleep(0.001 * dist.get_rank())
+            print(
+                f"[Rank {dist.get_rank()}-{func.__name__}-{dist.get_process_group_ranks(kwargs.get('group', dist.group.WORLD))}]: Called from {''.join(stack[st:en])}args={args_str} kwargs={kwargs_str}\n"
+            )
+            dist.barrier()
+            return func(*args, **kwargs)
+
+        return patched_func
+
+    if enable:
+        for func in funcs_to_patch:
+            original_funcs[func.__name__] = getattr(dist, func.__name__)
+            patched_funcs[func.__name__] = make_patched(func)
+            setattr(dist, func.__name__, patched_funcs[func.__name__])
+
+    try:
+        yield
+    finally:
+        for func_name, original_func in original_funcs.items():
+            setattr(dist, func_name, original_func)
diff --git a/tests/test_moe/test_moe_checkpoint.py b/tests/test_moe/test_moe_checkpoint.py
index 89f5d1c64d0d..f3f109192756 100644
--- a/tests/test_moe/test_moe_checkpoint.py
+++ b/tests/test_moe/test_moe_checkpoint.py
@@ -130,7 +130,7 @@ def check_moe_checkpoint(test_config):
         dist.barrier()
         if dist.get_rank() == 0:
             saved_model = model_cls.from_pretrained(model_dir).cuda().to(dtype)
-            check_model_equal(orig_model, saved_model)
+            check_model_equal(orig_model, saved_model, dtype=dtype)
             saved_model.save_pretrained(hf_model_dir)
         dist.barrier()
         # check load model
@@ -138,7 +138,7 @@ def check_moe_checkpoint(test_config):
         new_optimizer = Adam(new_model.parameters(), lr=1e-3)
         new_model, new_optimizer, *_ = booster.boost(model=new_model, optimizer=new_optimizer)
         booster.load_model(new_model, hf_model_dir)
-        check_model_equal(model, new_model)
+        check_model_equal(model, new_model, dtype=dtype)
 
         # check save optimizer
         optimizer.step()
diff --git a/tests/test_shardformer/test_model/test_shard_deepseek.py b/tests/test_shardformer/test_model/test_shard_deepseek.py
index 46da4522fd9d..d782a2a09604 100644
--- a/tests/test_shardformer/test_model/test_shard_deepseek.py
+++ b/tests/test_shardformer/test_model/test_shard_deepseek.py
@@ -12,43 +12,25 @@
 import colossalai
 from colossalai.booster.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
+from colossalai.shardformer.layer.utils import Randomizer
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 from colossalai.testing.random import seed_all
 from tests.test_moe.moe_utils import assert_loose_close, check_model_equal
 
 NUM_BATCH = 8
-NUM_TOK_PER_BATCH, NUM_EXPERTS = 4, 2
+NUM_TOK_PER_BATCH, NUM_EXPERTS = 64, 4
 NUM_LAYERS = 4
 HIDDEN_SIZE_PER_HEAD = 4
-NUM_HEADS = 4
+NUM_HEADS = 8
 TOP_K = 2
 
 
-CHECKED_CONFIG = [  # FOR_WORLD=4
-    (1, 4, 1, 1, 1),
-    (1, 1, 4, 1, 1),
-    (1, 1, 1, 4, 1),
-    (1, 1, 1, 1, 4),
-    (0, 1, 4, 1, 1),
-    (0, 1, 1, 4, 1),
-    (0, 1, 1, 1, 4),
-    (1, 2, 1, 1, 1),
-]
-
-
-@parameterize(
-    "config",
-    [
-        (1, 2, 2, 1, 1),
-        (1, 2, 1, 2, 1),
-        (1, 2, 1, 1, 2),
-    ],
-)
-def run_zero_with_original_model(config: Tuple[int, ...]):
+def run_deepseek_commom(config: Tuple[int, ...]):
+    Randomizer.reset_index()
     stage, ep_size, pp_size, tp_size, sp_size = config
     world_size = dist.get_world_size()
     rank = dist.get_rank()
-    dtype, precision = torch.float16, "fp16"
+    dtype, precision = torch.bfloat16, "bf16"
     torch.cuda.set_device(dist.get_rank())
 
     plugin = MoeHybridParallelPlugin(
@@ -60,11 +42,11 @@ def run_zero_with_original_model(config: Tuple[int, ...]):
         zero_stage=stage,
         enable_sequence_parallelism=sp_size > 1,
         sequence_parallelism_mode="all_to_all" if sp_size > 1 else None,
-        enable_flash_attention=sp_size > 1,
         overlap_communication=False,
         initial_scale=1,
         precision=precision,
         find_unused_parameters=True,
+        enable_flash_attention=True,
     )
     dp_size = plugin.dp_size
 
@@ -171,7 +153,7 @@ def run_zero_with_original_model(config: Tuple[int, ...]):
     dist.barrier()
 
     saved_model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).cuda()
-    check_model_equal(torch_model, saved_model)
+    check_model_equal(torch_model, saved_model, dtype=dtype)
     dist.barrier()
 
     if rank == world_size - 1:
@@ -180,17 +162,77 @@ def run_zero_with_original_model(config: Tuple[int, ...]):
     print(f"rank {dist.get_rank()} test passed")
 
 
-def run_dist(rank, world_size, port):
+@parameterize(
+    "config",
+    [
+        # DDP: ep == 1 since ep * moe_dp == dp == moe_dp; sp == 1 since sp * dp == moe_dp == dp
+        (0, 1, 4, 1, 1),
+        (0, 1, 1, 4, 1),
+        (0, 1, 2, 2, 1),
+        # zero 1
+        (1, 4, 1, 1, 1),
+        (1, 1, 4, 1, 1),
+        (1, 1, 1, 4, 1),
+        (1, 2, 1, 1, 2),
+        # zero 2
+        (2, 4, 1, 1, 1),
+        (2, 1, 4, 1, 1),
+        (2, 1, 1, 4, 1),
+        (2, 2, 1, 1, 2),
+    ],
+)
+def run_deepseek_test(config: Tuple[int, ...]):
+    run_deepseek_commom(config)
+
+
+@parameterize(
+    "config",
+    [
+        # DDP: ep == 1 since ep * moe_dp == dp == moe_dp; sp == 1 since sp * dp == moe_dp == dp
+        (0, 1, 2, 4, 1),
+        (0, 1, 4, 2, 1),
+        (0, 1, 1, 4, 1),
+        (0, 1, 4, 1, 1),
+        # zero 1:
+        (1, 2, 1, 1, 2),
+        (1, 2, 1, 4, 1),
+        (1, 1, 1, 2, 2),
+        (1, 2, 2, 2, 1),
+        # zero 2
+        (2, 2, 1, 1, 2),
+        (2, 2, 1, 4, 1),
+        (2, 1, 1, 2, 2),
+        (2, 2, 2, 2, 1),
+    ],
+)
+def run_deepseek_3d_test(config: Tuple[int, ...]):
+    run_deepseek_commom(config)
+
+
+def check_deepseek(rank, world_size, port):
+    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_deepseek_test()
+
+
+def check_deepseek_3d(rank, world_size, port):
     colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
-    run_zero_with_original_model()
+    run_deepseek_3d_test()
 
 
 @pytest.mark.dist
 @pytest.mark.parametrize("world_size", [4])
 @rerun_if_address_is_in_use()
 def test_deepseek(world_size):
-    spawn(run_dist, world_size)
+    spawn(check_deepseek, world_size)
+
+
+@pytest.mark.largedist
+@pytest.mark.parametrize("world_size", [8])
+@rerun_if_address_is_in_use()
+def test_deepseek_3d(world_size):
+    spawn(check_deepseek_3d, world_size)
 
 
 if __name__ == "__main__":
-    test_deepseek(world_size=4)
+    test_deepseek(world_size=8)
+    test_deepseek_3d(world_size=8)
diff --git a/tests/test_shardformer/test_model/test_shard_mixtral.py b/tests/test_shardformer/test_model/test_shard_mixtral.py
index de09eedcbed5..940c66cf637b 100644
--- a/tests/test_shardformer/test_model/test_shard_mixtral.py
+++ b/tests/test_shardformer/test_model/test_shard_mixtral.py
@@ -13,42 +13,25 @@
 import colossalai
 from colossalai.booster.booster import Booster
 from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
+from colossalai.shardformer.layer.utils import Randomizer
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 from colossalai.testing.random import seed_all
 from tests.test_moe.moe_utils import assert_loose_close, check_model_equal
 
 NUM_BATCH = 8
-NUM_TOK_PER_BATCH, NUM_EXPERTS = 4, 4
+NUM_TOK_PER_BATCH, NUM_EXPERTS = 64, 4
 NUM_LAYERS = 4
 HIDDEN_SIZE_PER_HEAD = 4
-NUM_HEADS = 4
-TOP_K = 1
+NUM_HEADS = 8
+TOP_K = 2
 
-CHECKED_CONFIG = [  # FOR WORLD=4
-    (0, 1, 4, 1, 1),
-    (0, 1, 1, 4, 1),
-    (0, 1, 1, 1, 4),
-    (1, 4, 1, 1, 1),
-    (1, 1, 4, 1, 1),
-    (1, 1, 1, 4, 1),
-    (1, 1, 1, 1, 4),
-    (1, 2, 1, 1, 1),
-]
 
-
-@parameterize(
-    "config",
-    [
-        (1, 2, 2, 1, 1),
-        (1, 2, 1, 2, 1),
-        (1, 2, 1, 1, 2),
-    ],
-)
-def run_zero_with_original_model(config: Tuple[int, ...]):
+def run_mixtral_commom(config: Tuple[int, ...]):
+    Randomizer.reset_index()
     stage, ep_size, pp_size, tp_size, sp_size = config
     world_size = dist.get_world_size()
     rank = dist.get_rank()
-    dtype, precision = torch.float16, "fp16"
+    dtype, precision = torch.bfloat16, "bf16"
     torch.cuda.set_device(dist.get_rank())
 
     plugin = MoeHybridParallelPlugin(
@@ -165,7 +148,7 @@ def run_zero_with_original_model(config: Tuple[int, ...]):
     dist.barrier()
 
     saved_model = MixtralModel.from_pretrained(model_dir).cuda().to(dtype)
-    check_model_equal(torch_model, saved_model)
+    check_model_equal(torch_model, saved_model, dtype=dtype)
     dist.barrier()
 
     if rank == world_size - 1:
@@ -174,17 +157,78 @@ def run_zero_with_original_model(config: Tuple[int, ...]):
     print(f"rank {dist.get_rank()} test passed")
 
 
-def run_dist(rank, world_size, port):
+@parameterize(
+    "config",
+    [
+        # DDP: ep == 1 since ep * moe_dp == dp == moe_dp; sp == 1 since sp * dp == moe_dp == dp
+        (0, 1, 4, 1, 1),
+        (0, 1, 1, 4, 1),
+        (0, 1, 2, 2, 1),
+        # zero 1
+        (1, 4, 1, 1, 1),
+        (1, 1, 4, 1, 1),
+        (1, 1, 1, 4, 1),
+        (1, 2, 1, 1, 2),
+        # zero 2
+        (2, 4, 1, 1, 1),
+        (2, 1, 4, 1, 1),
+        (2, 1, 1, 4, 1),
+        (2, 2, 1, 1, 2),
+    ],
+)
+def run_mixtral_test(config: Tuple[int, ...]):
+    run_mixtral_commom(config)
+
+
+@parameterize(
+    "config",
+    [
+        # DDP: ep == 1 since ep * moe_dp == dp == moe_dp; sp == 1 since sp * dp == moe_dp == dp
+        (0, 1, 2, 4, 1),
+        (0, 1, 4, 2, 1),
+        (0, 1, 1, 4, 1),
+        (0, 1, 4, 1, 1),
+        # zero 1:
+        (1, 2, 1, 1, 2),
+        (1, 2, 1, 4, 1),
+        (1, 1, 1, 2, 2),
+        (1, 2, 2, 2, 1),
+        # zero 2
+        (2, 2, 1, 1, 2),
+        (2, 2, 1, 4, 1),
+        (2, 1, 1, 2, 2),
+        (2, 2, 2, 2, 1),
+    ],
+)
+def run_mixtral_3d_test(config: Tuple[int, ...]):
+    print(f"{config=}")
+    run_mixtral_commom(config)
+
+
+def check_mixtral(rank, world_size, port):
+    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_mixtral_test()
+
+
+def check_mixtral_3d(rank, world_size, port):
     colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
-    run_zero_with_original_model()
+    run_mixtral_3d_test()
 
 
 @pytest.mark.dist
 @pytest.mark.parametrize("world_size", [4])
 @rerun_if_address_is_in_use()
 def test_mixtral(world_size):
-    spawn(run_dist, world_size)
+    spawn(check_mixtral, world_size)
+
+
+@pytest.mark.largedist
+@pytest.mark.parametrize("world_size", [8])
+@rerun_if_address_is_in_use()
+def test_mixtral_3d(world_size):
+    spawn(check_mixtral_3d, world_size)
 
 
 if __name__ == "__main__":
-    test_mixtral(world_size=4)
+    test_mixtral(world_size=8)
+    test_mixtral_3d(world_size=8)

From 13946c4448ccd2ce981b192190251348ccc8a302 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 11 Sep 2024 16:11:25 +0800
Subject: [PATCH 074/167] [fp8] hotfix backward hook (#6053)

* [fp8] hotfix backward hook

* [fp8] hotfix pipeline loss accumulation
---
 .../booster/plugin/hybrid_parallel_plugin.py  | 22 ++++++++++---------
 .../booster/plugin/low_level_zero_plugin.py   |  8 ++++---
 colossalai/initialize.py                      |  6 +++++
 .../pipeline/schedule/interleaved_pp.py       |  2 +-
 colossalai/pipeline/schedule/one_f_one_b.py   |  2 +-
 colossalai/zero/low_level/low_level_optim.py  |  8 +++++--
 6 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 6a333862a909..8e972d0146da 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -216,7 +216,7 @@ def forward(self, *args, **kwargs):
         if self.convert_fn is not None:
             args = tree_map(self.convert_fn, args)
             kwargs = tree_map(self.convert_fn, kwargs)
-        with self._wait_all_gather():
+        with self._hook_context():
             return super().forward(*args, **kwargs)
 
     def unwrap(self):
@@ -229,12 +229,8 @@ def _force_wait_all_gather(self):
         for p in self.module.parameters():
             wait_all_gather_handle(p)
 
-    def _wait_all_gather(self):
-        return (
-            ColoParamOpHookManager.use_hooks(*self.op_hooks)
-            if (self.overlap_allgather or self.use_fp8)
-            else nullcontext()
-        )
+    def _hook_context(self):
+        return ColoParamOpHookManager.use_hooks(*self.op_hooks) if len(self.op_hooks) > 0 else nullcontext()
 
 
 def get_param_info(optim: Optimizer):
@@ -317,7 +313,8 @@ def backward(self, loss: Tensor, *args, **kwargs):
         """
 
         # Call the superclass backward method to compute gradients.
-        super().backward(loss, *args, **kwargs)
+        with self.model._hook_context():
+            super().backward(loss, *args, **kwargs)
 
         if self.model.require_grad_sync:
             # If gradient synchronization is required, sync sequence parallelism gradients.
@@ -540,7 +537,8 @@ def backward(self, loss: Tensor, *args, **kwargs):
             None
         """
         # Call the superclass backward method to compute gradients.
-        super().backward(loss, *args, **kwargs)
+        with self.model._hook_context():
+            super().backward(loss, *args, **kwargs)
 
         if self.model.require_grad_sync:
             # If gradient synchronization is required, sync sequence parallelism gradients.
@@ -683,6 +681,7 @@ def __init__(
         pp_process_group: Optional[ProcessGroup] = None,  # if using pp
         forced_dtype: Optional[torch.dtype] = None,
         overlap_allgather: bool = False,
+        fp8_communication: bool = False,
     ):
         self.model = model
         self.param_info = param_info
@@ -712,6 +711,8 @@ def __init__(
             dp_process_group=dp_process_group,
             forced_dtype=forced_dtype,
             overlap_allgather=overlap_allgather,
+            fp8_communication=fp8_communication,
+            backward_context=model._hook_context,
         )
 
     def sync_dp_grads(self):
@@ -1206,6 +1207,7 @@ def __init__(
             partition_grad=(self.zero_stage == 2),
             forced_dtype=PRECISION_TORCH_TYPE[precision],
             overlap_allgather=overlap_allgather,
+            fp8_communication=fp8_communication,
         )
 
         self.max_norm = max_norm
@@ -1371,7 +1373,7 @@ def execute_pipeline(
         # so we disable it, performing manual reduction instead.
         ctx = optimizer.no_sync() if isinstance(optimizer, HybridParallelZeroOptimizer) else model.no_sync()
 
-        with ctx, model._wait_all_gather():
+        with ctx, model._hook_context():
             outputs = self.schedule.forward_backward_step(
                 model, data_iter, criterion, optimizer, return_loss, return_outputs
             )
diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 8cc511a5610f..cec15dd5dd34 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -100,14 +100,16 @@ def forward(self, *args, **kwargs):
         if self.convert_fn is not None:
             args = tree_map(self.convert_fn, args)
             kwargs = tree_map(self.convert_fn, kwargs)
-        ctx = ColoParamOpHookManager.use_hooks(*self.op_hooks) if self.overlap_allgather else nullcontext()
-        with ctx:
+        with self._hook_context():
             return super().forward(*args, **kwargs)
 
     def _force_wait_all_gather(self):
         for p in self.module.parameters():
             wait_all_gather_handle(p)
 
+    def _hook_context(self):
+        return ColoParamOpHookManager.use_hooks(*self.op_hooks) if len(self.op_hooks) > 0 else nullcontext()
+
 
 class LowLevelZeroCheckpointIO(TorchDDPCheckpointIO):
     def save_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, gather_dtensor: bool = False):
@@ -520,7 +522,7 @@ def configure(
 
         if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
             optimizer: LowLevelZeroOptimizer = LowLevelZeroOptimizer(
-                optimizer, **zero_optim_kwargs, verbose=self.verbose
+                optimizer, **zero_optim_kwargs, verbose=self.verbose, backward_context=model._hook_context
             )
             # inject update_master_params
             model.update_master_params = MethodType(optimizer.update_master_params, model)
diff --git a/colossalai/initialize.py b/colossalai/initialize.py
index 4e2eff7ce352..5414791461c6 100644
--- a/colossalai/initialize.py
+++ b/colossalai/initialize.py
@@ -9,6 +9,7 @@
 # https://forums.developer.nvidia.com/t/how-many-streams-maximum-number-of-streams/6571/16
 os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
 
+import torch
 import torch.distributed as dist
 
 from colossalai.accelerator import get_accelerator
@@ -64,6 +65,11 @@ def launch(
 
     set_seed(seed)
 
+    try:
+        torch._dynamo.config.optimize_ddp = world_size > 1
+    except AttributeError:
+        pass
+
     if verbose:
         logger = get_dist_logger()
         logger.info(f"Distributed environment is initialized, world size: {dist.get_world_size()}", ranks=[0])
diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index 28c4eb8d8d4a..c538ee0715b4 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -318,7 +318,7 @@ def forward_step(
             if self.stage_manager.is_last_stage():
                 loss = criterion(output_obj, micro_batch) / self.num_microbatch
                 if accum_loss is not None:
-                    accum_loss.add_(loss.detach())
+                    accum_loss.add_(loss.data)
                 if outputs is not None:
                     outputs.append(tree_map(detach, output_obj))
                 return loss
diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index 67aaa5eb1b59..0fc90995adcc 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -273,7 +273,7 @@ def forward_step(
             loss = criterion(output_obj, micro_batch) / self.num_microbatches
 
             if accum_loss is not None:
-                accum_loss.add_(loss.detach())
+                accum_loss.add_(loss.data)
             if outputs is not None:
                 outputs.append(tree_map_hf(detach, output_obj))
             return loss
diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py
index 458e6e41a29e..ed51c2bacafc 100644
--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -1,6 +1,6 @@
 # this code is inspired by the DeepSpeed library and implemented with our own design from scratch
 import copy
-from contextlib import contextmanager
+from contextlib import contextmanager, nullcontext
 from functools import partial
 from typing import Dict, Iterator, List, Optional, Tuple
 from weakref import proxy
@@ -88,6 +88,7 @@ def __init__(
         master_weights: bool = True,  # master weights
         overlap_allgather: bool = False,
         fp8_communication: bool = False,
+        backward_context=None,
     ):
         super(LowLevelZeroOptimizer, self).__init__(optim=optimizer)
 
@@ -130,6 +131,7 @@ def __init__(
         self._reduce_bucket_size = reduce_bucket_size
         self._communication_dtype = communication_dtype
         self._fp8_communication = fp8_communication
+        self._backward_context = backward_context
 
         # gradient clipping
         self._clip_grad_norm = clip_grad_norm
@@ -429,7 +431,9 @@ def backward(self, loss, retain_graph=False):
         if self.mixed_precision_mixin is not None:
             loss = self.mixed_precision_mixin.pre_backward(loss)
 
-        loss.backward(retain_graph=retain_graph)
+        ctx = nullcontext() if self._backward_context is None else self._backward_context()
+        with ctx:
+            loss.backward(retain_graph=retain_graph)
 
         if not self.require_grad_sync:
             return

From a35a078f0849cf1c805dbae94abe5476b1615ca0 Mon Sep 17 00:00:00 2001
From: flybird11111 <1829166702@qq.com>
Date: Wed, 11 Sep 2024 17:25:14 +0800
Subject: [PATCH 075/167] [doc] update sp doc (#6055)

* update sp doc

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../en/concepts/paradigms_of_parallelism.md   |  19 +++
 .../en/features/sequence_parallelism.md       | 156 ++++++++++++++++++
 .../concepts/paradigms_of_parallelism.md      |  20 +++
 .../zh-Hans/features/sequence_parallelism.md  | 155 +++++++++++++++++
 4 files changed, 350 insertions(+)
 create mode 100644 docs/source/en/features/sequence_parallelism.md
 create mode 100644 docs/source/zh-Hans/features/sequence_parallelism.md

diff --git a/docs/source/en/concepts/paradigms_of_parallelism.md b/docs/source/en/concepts/paradigms_of_parallelism.md
index 1a5dab7a76f7..80f48e44a5dc 100644
--- a/docs/source/en/concepts/paradigms_of_parallelism.md
+++ b/docs/source/en/concepts/paradigms_of_parallelism.md
@@ -87,6 +87,24 @@ Related paper:
 - [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
 - [Chimera: Efficiently Training Large-Scale Neural Networks with Bidirectional Pipelines](https://arxiv.org/abs/2107.06925)
 
+###  Sequence Parallelism
+Sequence parallelism is a parallel strategy that partitions along the sequence dimension, making it an effective method for training long text sequences. Mature sequence parallelism methods include Megatron’s sequence parallelism, DeepSpeed-Ulysses sequence parallelism, and ring-attention sequence parallelism.
+
+#### Megatron SP:
+This sequence parallelism method is implemented on top of tensor parallelism. On each GPU in model parallelism, the samples are independent and replicated. For parts that cannot utilize tensor parallelism, such as non-linear operations like LayerNorm, the sample data can be split into multiple parts along the sequence dimension, with each GPU computing a portion of the data. Then, tensor parallelism is used for the linear parts like attention and MLP, where activations need to be aggregated. This approach further reduces activation memory usage when the model is partitioned. It is important to note that this sequence parallelism method can only be used in conjunction with tensor parallelism.
+
+#### DeepSpeed-Ulysses:
+In this sequence parallelism, samples are split along the sequence dimension and the all-to-all communication operation is used, allowing each GPU to receive the full sequence but only compute the non-overlapping subset of attention heads, thereby achieving sequence parallelism. This parallel method supports fully general attention, allowing both dense and sparse attention.
+all-to-all is a full exchange operation, similar to a distributed transpose operation. Before attention computation, samples are split along the sequence dimension, so each device only has a sequence length of N/P. However, after using all-to-all, the shape of the qkv subparts becomes [N, d/p], ensuring the overall sequence is considered during attention computation.
+
+#### Ring Attention:
+Ring attention is conceptually similar to flash attention. Each GPU computes only a local attention, and finally, the attention blocks are reduced to calculate the total attention. In Ring Attention, the input sequence is split into multiple chunks along the sequence dimension, with each chunk handled by a different GPU or processor. Ring Attention employs a strategy called "ring communication," where kv sub-blocks are passed between GPUs through p2p communication for iterative computation, enabling multi-GPU training on ultra-long texts. In this strategy, each processor exchanges information only with its predecessor and successor, forming a ring network. This allows intermediate results to be efficiently transmitted between processors without global synchronization, reducing communication overhead.
+
+Related paper：
+[Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/pdf/2205.05198)
+[DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://arxiv.org/abs/2309.14509)
+[Ring Attention with Blockwise Transformers for Near-Infinite Context](https://arxiv.org/pdf/2310.01889)
+
 
 ## Optimizer-Level Parallel
 
@@ -122,3 +140,4 @@ Related paper:
 - [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
 - [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
 - [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
+<!-- doc-test-command: echo  -->
diff --git a/docs/source/en/features/sequence_parallelism.md b/docs/source/en/features/sequence_parallelism.md
new file mode 100644
index 000000000000..70fd2eb10970
--- /dev/null
+++ b/docs/source/en/features/sequence_parallelism.md
@@ -0,0 +1,156 @@
+# Sequence Parallelism
+
+Author: Mingyan Jiang
+
+**Prerequisite Tutorials**
+- [Paradigms of Parallelism](../concepts/paradigms_of_parallelism.md)
+- [Booster API](../basics/booster_api.md)
+- [Shardformer](../features/shardformer.md)
+- [Booster plugin](../basics/booster_plugins.md)
+
+**Example Code**
+- [Using Sequence Parallelism Strategy](https://github.com/hpcaitech/ColossalAI/blob/main/examples/language/llama/benchmark.py)
+
+**Related Papers**
+[Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/pdf/2205.05198)
+[DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://arxiv.org/abs/2309.14509)
+[Ring Attention with Blockwise Transformers for Near-Infinite Context](https://arxiv.org/pdf/2310.01889)
+
+## Quick Overview
+
+In this tutorial, you will learn how to use sequence parallelism. In Colossal-AI, we have implemented several types of sequence parallelism, including TP+SP, DeepSpeed-Ulysses, and ring attention. Below, we will introduce how to use these different types of sequence parallelism.
+
+## Table Of Content
+
+In this tutorial, we will cover the use of three sequence parallelism strategies:
+
+1. Using TP+SP;
+2. Using DeepSpeed-Ulysses;
+3. Using ring attention.
+
+
+## Implementation in Colossal-AI
+
+In Colossal-AI, sequence parallelism is implemented via the shardformer and can be invoked through the `HybridParallelPlugin` and `MoeHybridParallelPlugin` interfaces. For more information about the plugins, refer to the [plugin usage documentation](../basics/booster_plugins.md).
+
+### Using Sequence Parallelism with HybridParallelPlugin
+
+The `HybridParallelPlugin` supports three types of sequence parallelism: TP+SP, DeepSpeed-Ulysses, and ring attention. You can refer to the parallel techniques introduction [document](../concepts/paradigms_of_parallelism.md) for more details. An [example](https://github.com/hpcaitech/ColossalAI/blob/main/examples/language/llama/benchmark.py) of sequence parallelism with HybridParallelPlugin can be found here.
+
+#### Defining Model Components
+
+```python
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM
+from transformers.models.llama.configuration_llama import LlamaConfig
+from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+import torch.distributed as dist
+from colossalai.booster import Booster
+config = LlamaConfig(max_position_embeddings=4096)
+from colossalai.booster.plugin import HybridParallelPlugin
+
+# define dataset
+class RandomDataset(Dataset):
+    def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
+        self.num_samples = num_samples
+        self.max_length = max_length
+        self.input_ids = torch.randint(
+            0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+        )
+        self.attention_mask = torch.ones_like(self.input_ids)
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, idx):
+        return {
+            "input_ids": self.input_ids[idx],
+            "attention_mask": self.attention_mask[idx],
+            "labels": self.input_ids[idx],
+        }
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-b", "--batch_size", type=int, default=2, help="Batch size")
+parser.add_argument("-s", "--num_steps", type=int, default=5, help="Number of steps to run")
+parser.add_argument("-l", "--max_length", type=int, default=4096, help="Max sequence length")
+parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+parser.add_argument("--sp", type=int, default=1, help="Sequence parallel size")
+args = parser.parse_args()
+
+model = AutoModelForCausalLM.from_config(
+    config,
+    trust_remote_code=True,
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.bfloat16,
+)
+optimizer = HybridAdam(model.parameters())
+scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
+# usually, num_samples=args.batch_size * args.num_steps * dp_size
+dataset = RandomDataset(
+        num_samples=10000, max_length=args.max_length, vocab_size=config.vocab_size
+    )
+```
+### Using TP+SP
+Define the plugin. When using this sequence parallelism, sp_size will be set to match tp_size, and the tp group will overlap with the sp group.
+```python
+plugin = HybridParallelPlugin(
+            tp_size=4,
+            sp_size=1,
+            enable_all_optimization=True,
+            enable_sequence_parallelism=True,
+            sequence_parallelism_mode="split_gather",
+        )
+```
+
+#### Using DeepSpeed-Ulysses
+Define the plugin. In the DeepSpeed-Ulysses sequence parallelism, the tp group and sp group are orthogonal.
+```python
+plugin = HybridParallelPlugin(
+            tp_size=2,
+            sp_size=2,
+            enable_all_optimization=True,
+            enable_sequence_parallelism=True,
+            sequence_parallelism_mode="all_to_all",
+        )
+```
+
+#### Using Ring Attention
+Define the plugin. In ring attention sequence parallelism, the tp group and sp group are orthogonal, and sp_size must be set to the correct parallel size.
+```python
+plugin = HybridParallelPlugin(
+            tp_size=2,
+            sp_size=2,
+            enable_all_optimization=True,
+            enable_sequence_parallelism=True,
+            sequence_parallelism_mode="ring_attn",
+        )
+```
+#### Using Booster
+```python
+booster = Booster(plugin=plugin)
+dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True, seed=42)
+model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
+```
+
+#### Training the Model
+```python
+for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not dist.get_rank()==0)):
+    outputs = model(**batch)
+    loss = outputs[0]
+    del outputs  # free memory
+
+    if dist.get_rank() == dist.get_world_size() - 1:
+        print(f"Step {step} loss: {loss}")
+    booster.backward(loss, optimizer)
+    optimizer.step()
+    optimizer.zero_grad()
+```
+### Sequence Parallelism with MoeHybridParallelPlugin
+Currently, the `MoeHybridParallelPlugin` only supports DeepSpeed-Ulysses sequence parallelism. The usage is similar to HybridParallelPlugin. For specific examples, refer to this [example](https://github.com/hpcaitech/ColossalAI/blob/main/examples/language/deepseek/benchmark.py).
+
+
+
+### Conclusion
+Among the sequence parallelism methods mentioned, ring attention has no requirements for the number of attention heads and can train ultra-long sequences. However, due to the division of computation, its performance may decrease. TP+SP and DeepSpeed-Ulysses have requirements for the number of attention heads, which must be divisible by the sp group size. These sequence parallelism methods are all compatible with high-performance attention mechanisms like flash attention. Sequence parallelism can also be used with Gemini to train extremely large-scale models, and it can be combined with TP, PP, and DP to form 4D parallelism.
+
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=4 sequence_parallelism.py  -->
diff --git a/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md b/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md
index 8f52d28ecdf4..b24349d0689c 100755
--- a/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md
+++ b/docs/source/zh-Hans/concepts/paradigms_of_parallelism.md
@@ -62,6 +62,25 @@
 - [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
 - [Chimera: Efficiently Training Large-Scale Neural Networks with Bidirectional Pipelines](https://arxiv.org/abs/2107.06925)
 
+### 序列并行
+序列并行是一种对于序列维度进行切分的并行策略，它是训练长文本序列的有效方法。现成熟的序列并行方法包括megatron提出的序列并行，DeepSpeed-Ulysses序列并行和ring-attention序列并行等。
+#### megatron sp:
+
+该序列并行方法是在张量并行的基础上实现的序列并行，模型并行的每个gpu上，样本独立且重复的，对于非线性运算的部分如layernorm等无法使用张量并行的模块，可以在序列维度将样本数据切分为多个部分，每个gpu计算部分数据，然后在计算attention及mlp等线性部分使用张量并行策略，需要将activation汇总，这样可以在模型进行切分的情况下进一步减少activation的内存占用，需要注意的是该序列并行方法只能与张量并行一起使用。
+
+#### DeepSpeed-Ulysses:
+
+序列并行通过在序列维度上分割样本并利用all-to-all通信操作，使每个GPU接收完整序列但仅计算注意力头的非重叠子集，从而实现序列并行。该并行方法具有完全通用的attention，可支持密集和稀疏的注意力。
+alltoall是一个全交换操作，相当于分布式转置的操作，在attention计算之前，将样本沿序列维度进行切分，每个设备只有N/P的序列长度，然而使用alltoall后，qkv的子部分shape变为[N, d/p]，在计算attention时仍考虑了整体的序列。
+#### ring attention：
+
+ring attention思路类似于flash attention，每个GPU只计算一个局部的attention，最后将所有的attention块结果进行归约计算出总的attention。在Ring Attention中，输入序列被沿着序列维度切分为多个块，每个块由不同的GPU或处理器负责处理，Ring Attention采用了一种称为“环形通信”的策略，通过跨卡的p2p通信相互传递kv子块来实现迭代计算，可以实现多卡的超长文本。在这种策略下，每个处理器只与它的前一个和后一个处理器交换信息，形成一个环形网络。通过这种方式，中间结果可以在处理器之间高效传递，而无需全局同步，减少了通信开销。
+
+相关论文：
+[Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/pdf/2205.05198)
+[DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://arxiv.org/abs/2309.14509)
+[Ring Attention with Blockwise Transformers for Near-Infinite Context](https://arxiv.org/pdf/2310.01889)
+
 
 ## 优化器相关的并行
 
@@ -90,3 +109,4 @@
 - [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
 - [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
 - [PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management](https://arxiv.org/abs/2108.05818)
+<!-- doc-test-command: echo  -->
diff --git a/docs/source/zh-Hans/features/sequence_parallelism.md b/docs/source/zh-Hans/features/sequence_parallelism.md
new file mode 100644
index 000000000000..534035cb5abf
--- /dev/null
+++ b/docs/source/zh-Hans/features/sequence_parallelism.md
@@ -0,0 +1,155 @@
+# 序列并行
+
+作者: Mingyan Jiang
+
+**前置教程**
+- [并行技术](../concepts/paradigms_of_parallelism.md)
+- [Booster API](../basics/booster_api.md)
+- [Shardformer](../features/shardformer.md)
+- [Booster 插件](../basics/booster_plugins.md)
+
+**示例代码**
+- [使用序列并行策略](https://github.com/hpcaitech/ColossalAI/blob/main/examples/language/llama/benchmark.py)
+
+**相关论文**
+[Reducing Activation Recomputation in Large Transformer Models](https://arxiv.org/pdf/2205.05198)
+[DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://arxiv.org/abs/2309.14509)
+[Ring Attention with Blockwise Transformers for Near-Infinite Context](https://arxiv.org/pdf/2310.01889)
+
+## 快速预览
+
+在本教程中，你将学习如何使用序列并行。在 Colossal-AI 中, 我们实现了包括TP+SP， DeepSpeed-Ulysses， ring attention等多种序列并行. 我们下面将介绍如何使用这几种序列并行。
+
+## 目录
+
+在本教程中，我们将介绍三种序列并行的使用:
+
+1. 使用TP+SP；
+2. 使用DeepSpeed-Ulysses；
+3. 使用ring attention
+
+
+## Colossal-AI中的实现
+
+在 Colossal-AI 中，shardformer实现了序列并行，并通过`HybridParallelPlugin`和`MoeHybridParallelPlugin`接口可进行调用。相关plugin的介绍请参考plugin的[使用文档](../basics/booster_plugins.md)。
+
+### 使用`HybridParallelPlugin`的序列并行
+`HybridParallelPlugin`的序列支持了TP+SP， DeepSpeed-Ulysses， ring attention三种实现，相关序列并行的结束可参考[并行技术介绍文档](../concepts/paradigms_of_parallelism.md)，`HybridParallelPlugin`中的序列并行[例子](https://github.com/hpcaitech/ColossalAI/blob/main/examples/language/llama/benchmark.py)
+
+#### 定义模型相关组件
+
+```python
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM
+from transformers.models.llama.configuration_llama import LlamaConfig
+from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+import torch.distributed as dist
+from colossalai.booster import Booster
+config = LlamaConfig(max_position_embeddings=4096)
+from colossalai.booster.plugin import HybridParallelPlugin
+
+# 定义数据集
+class RandomDataset(Dataset):
+    def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
+        self.num_samples = num_samples
+        self.max_length = max_length
+        self.input_ids = torch.randint(
+            0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+        )
+        self.attention_mask = torch.ones_like(self.input_ids)
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, idx):
+        return {
+            "input_ids": self.input_ids[idx],
+            "attention_mask": self.attention_mask[idx],
+            "labels": self.input_ids[idx],
+        }
+
+parser = argparse.ArgumentParser()
+parser.add_argument("-b", "--batch_size", type=int, default=2, help="Batch size")
+parser.add_argument("-s", "--num_steps", type=int, default=5, help="Number of steps to run")
+parser.add_argument("-l", "--max_length", type=int, default=4096, help="Max sequence length")
+parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+parser.add_argument("--sp", type=int, default=1, help="Sequence parallel size")
+args = parser.parse_args()
+
+model = AutoModelForCausalLM.from_config(
+    config,
+    trust_remote_code=True,
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.bfloat16,
+)
+optimizer = HybridAdam(model.parameters())
+scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
+# usually, num_samples=args.batch_size * args.num_steps * dp_size
+dataset = RandomDataset(
+        num_samples=10000, max_length=args.max_length, vocab_size=config.vocab_size
+    )
+```
+### 使用TP+SP
+定义plugin,使用该序列并行，`sp_size`会被设置为`tp_size`一致，且tp group 与sp group是重叠的。
+```python
+plugin = HybridParallelPlugin(
+            tp_size=4,
+            sp_size=1,
+            enable_all_optimization=True,
+            enable_sequence_parallelism=True,
+            sequence_parallelism_mode="split_gather",
+        )
+```
+
+#### 使用DeepSpeed-Ulysses
+定义plugin， 在DeepSpeed-Ulysses的序列并行种，tp group与sp group 是正交的，
+```python
+plugin = HybridParallelPlugin(
+            tp_size=2,
+            sp_size=2,
+            enable_all_optimization=True,
+            enable_sequence_parallelism=True,
+            sequence_parallelism_mode="all_to_all",
+        )
+```
+
+#### 使用ring attention
+定义plugin， 在ring attention的序列并行种，tp group与sp group 是正交的，sp_size必须传入准确的并行大小。
+```python
+plugin = HybridParallelPlugin(
+            tp_size=2,
+            sp_size=2,
+            enable_all_optimization=True,
+            enable_sequence_parallelism=True,
+            sequence_parallelism_mode="ring_attn",
+        )
+```
+#### 使用booster
+```python
+booster = Booster(plugin=plugin)
+dataloader = plugin.prepare_dataloader(dataset, batch_size=args.batch_size, shuffle=True, drop_last=True, seed=42)
+model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
+```
+
+#### 训练模型
+```python
+for step, batch in enumerate(tqdm(dataloader, desc="Step", disable=not dist.get_rank()==0)):
+    outputs = model(**batch)
+    loss = outputs[0]
+    del outputs  # free memory
+
+    if dist.get_rank() == dist.get_world_size() - 1:
+        print(f"Step {step} loss: {loss}")
+    booster.backward(loss, optimizer)
+    optimizer.step()
+    optimizer.zero_grad()
+```
+### 使用`MoeHybridParallelPlugin`的序列并行
+    `MoeHybridParallelPlugin`中的序列并行暂时只支持DeepSpeed-Ulysses类型,使用方法与`HybridParallelPlugin`类似，具体可参考[例子](https://github.com/hpcaitech/ColossalAI/blob/main/examples/language/deepseek/benchmark.py)
+
+
+
+### 结论
+在上述序列并行方法中，ring attention对head number没有要求，可训练超长文本，但是由于细分了计算，计算性能会有所下降。TP+SP， DeepSpeed-Ulysses对于head number有要求，需要可被sp group size 整除。这些序列并行都可与其他高性能注意力兼容，如flash attention。sp可与Gemini一起使用训练超大规模模型，也可以与TP，PP，DP等组成4D并行。
+
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=4 sequence_parallelism.py  -->

From fdd84b9087f6309e3da315559367bac3dd8c05e3 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 13 Sep 2024 02:32:03 +0000
Subject: [PATCH 076/167] fix the sp

---
 colossalai/kernel/kernel_loader.py   |  2 ++
 colossalai/shardformer/layer/attn.py | 32 ++++++++++++++++++++++++----
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/colossalai/kernel/kernel_loader.py b/colossalai/kernel/kernel_loader.py
index 2411b6482ac1..8598cf0ae0ed 100644
--- a/colossalai/kernel/kernel_loader.py
+++ b/colossalai/kernel/kernel_loader.py
@@ -118,6 +118,8 @@ class FlashAttentionLoader(KernelLoader):
         FlashAttentionSdpaCudaExtension,
     ]
 
+class FlashAttentionDaoLoader(KernelLoader):
+    REGISTRY = [FlashAttentionDaoCudaExtension]
 
 class FlashAttentionWithCustomMaskLoader(KernelLoader):
     REGISTRY = [FlashAttentionNpuExtension, FlashAttentionSdpaCudaExtension]
diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index bf4fa77c6c23..29ef64a4d6fe 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -10,6 +10,7 @@
 from colossalai.kernel.kernel_loader import (
     FlashAttentionForFloatAndCustomMaskLoader,
     FlashAttentionLoader,
+    FlashAttentionDaoLoader,
     FlashAttentionWithCustomMaskLoader,
     KernelLoader,
 )
@@ -17,6 +18,8 @@
 
 from .utils import RingComm, get_half_index, split_varlen_zigzag
 
+MEMORY_BOUND = 10 * 1e9
+
 __all__ = [
     "AttnMaskType",
     "ColoAttention",
@@ -104,7 +107,7 @@ def _init_kernels_dispatch():
             }
 
     @staticmethod
-    def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType]) -> Callable:
+    def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType], size) -> Callable:
         ColoAttention._init_kernels_dispatch()
         if (
             dtype not in ColoAttention._kernel_dispatch_map
@@ -113,12 +116,16 @@ def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType]) -> C
             raise ValueError(
                 "FlashAttention kernel is not available for dtype {} and mask_type {}".format(dtype, mask_type)
             )
+            
+        if size > MEMORY_BOUND:
+            FlashAttentionDaoLoader().load()
         # lazy load
         if isinstance(ColoAttention._kernel_dispatch_map[dtype][mask_type], KernelLoader):
             ColoAttention._kernel_dispatch_map[dtype][mask_type] = ColoAttention._kernel_dispatch_map[dtype][
                 mask_type
             ].load()
-        return ColoAttention._kernel_dispatch_map[dtype][mask_type]
+            
+        return FlashAttentionDaoLoader() if size > MEMORY_BOUND else ColoAttention._kernel_dispatch_map[dtype][mask_type]
 
     @staticmethod
     def prepare_attn_kwargs(
@@ -163,7 +170,7 @@ def prepare_attn_kwargs(
             outputs["attention_mask_type"] = AttnMaskType.CAUSAL
             attention_mask = torch.ones(s_q, s_kv, dtype=dtype, device=device)
             if s_q != 1:
-                attention_mask = attention_mask.tril(diagonal=0)
+                attention_mask.tril_(diagonal=0)
             attention_mask = attention_mask.expand(b, s_q, s_kv)
         else:
             max_seqlen_q, cu_seqlens_q, q_indices = get_pad_info(q_padding_mask)
@@ -197,6 +204,15 @@ def prepare_attn_kwargs(
         if invert:
             attention_mask = invert_mask(attention_mask).unsqueeze(1)
         outputs["attention_mask"] = attention_mask
+        
+        element_size = torch.tensor([], dtype=dtype).element_size()
+        memory_size = (s_q * s_kv * element_size)
+        if memory_size > MEMORY_BOUND:
+            attention_mask = torch.empty((0,), dtype=dtype, device=device)
+            outputs["attention_mask"] = attention_mask
+            if outputs["attention_mask_type"] != AttnMaskType.PADDED_CAUSAL:
+                outputs["attention_mask_type"] = AttnMaskType.CAUSAL
+            
         return outputs
 
     @staticmethod
@@ -278,8 +294,16 @@ def attention(
             assert attention_mask_type == AttnMaskType.CUSTOM
 
         # kernel dispatch
+        b, _, s_q, _ = q.shape
+        b, _, s_kv, _ = v.shape
+        element_size = torch.tensor([], dtype=q.dtype).element_size()
+        memory_size = (s_q * s_kv * element_size)
+        if memory_size > MEMORY_BOUND:
+            attention_mask = torch.empty((0,), dtype=q.dtype, device=q.device)
+            assert attention_mask_type == AttnMaskType.PADDED_CAUSAL or AttnMaskType.PADDED
+            
         mask_type = attention_mask_type if attention_mask is not None else None
-        attn_func = ColoAttention._dispatch_kernel(q.dtype, mask_type)
+        attn_func = ColoAttention._dispatch_kernel(q.dtype, mask_type, memory_size)
         is_causal = attention_mask is not None and attention_mask_type in (
             AttnMaskType.CAUSAL,
             AttnMaskType.PADDED_CAUSAL,

From 216d54e3747fedb3404f181023449182410d48d8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 13 Sep 2024 02:38:39 +0000
Subject: [PATCH 077/167] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 colossalai/kernel/kernel_loader.py   |  2 ++
 colossalai/shardformer/layer/attn.py | 20 +++++++++++---------
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/colossalai/kernel/kernel_loader.py b/colossalai/kernel/kernel_loader.py
index 8598cf0ae0ed..36a49aae918b 100644
--- a/colossalai/kernel/kernel_loader.py
+++ b/colossalai/kernel/kernel_loader.py
@@ -118,9 +118,11 @@ class FlashAttentionLoader(KernelLoader):
         FlashAttentionSdpaCudaExtension,
     ]
 
+
 class FlashAttentionDaoLoader(KernelLoader):
     REGISTRY = [FlashAttentionDaoCudaExtension]
 
+
 class FlashAttentionWithCustomMaskLoader(KernelLoader):
     REGISTRY = [FlashAttentionNpuExtension, FlashAttentionSdpaCudaExtension]
 
diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 29ef64a4d6fe..0a4f985358af 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -8,9 +8,9 @@
 from einops import rearrange
 
 from colossalai.kernel.kernel_loader import (
+    FlashAttentionDaoLoader,
     FlashAttentionForFloatAndCustomMaskLoader,
     FlashAttentionLoader,
-    FlashAttentionDaoLoader,
     FlashAttentionWithCustomMaskLoader,
     KernelLoader,
 )
@@ -116,7 +116,7 @@ def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType], size
             raise ValueError(
                 "FlashAttention kernel is not available for dtype {} and mask_type {}".format(dtype, mask_type)
             )
-            
+
         if size > MEMORY_BOUND:
             FlashAttentionDaoLoader().load()
         # lazy load
@@ -124,8 +124,10 @@ def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType], size
             ColoAttention._kernel_dispatch_map[dtype][mask_type] = ColoAttention._kernel_dispatch_map[dtype][
                 mask_type
             ].load()
-            
-        return FlashAttentionDaoLoader() if size > MEMORY_BOUND else ColoAttention._kernel_dispatch_map[dtype][mask_type]
+
+        return (
+            FlashAttentionDaoLoader() if size > MEMORY_BOUND else ColoAttention._kernel_dispatch_map[dtype][mask_type]
+        )
 
     @staticmethod
     def prepare_attn_kwargs(
@@ -204,15 +206,15 @@ def prepare_attn_kwargs(
         if invert:
             attention_mask = invert_mask(attention_mask).unsqueeze(1)
         outputs["attention_mask"] = attention_mask
-        
+
         element_size = torch.tensor([], dtype=dtype).element_size()
-        memory_size = (s_q * s_kv * element_size)
+        memory_size = s_q * s_kv * element_size
         if memory_size > MEMORY_BOUND:
             attention_mask = torch.empty((0,), dtype=dtype, device=device)
             outputs["attention_mask"] = attention_mask
             if outputs["attention_mask_type"] != AttnMaskType.PADDED_CAUSAL:
                 outputs["attention_mask_type"] = AttnMaskType.CAUSAL
-            
+
         return outputs
 
     @staticmethod
@@ -297,11 +299,11 @@ def attention(
         b, _, s_q, _ = q.shape
         b, _, s_kv, _ = v.shape
         element_size = torch.tensor([], dtype=q.dtype).element_size()
-        memory_size = (s_q * s_kv * element_size)
+        memory_size = s_q * s_kv * element_size
         if memory_size > MEMORY_BOUND:
             attention_mask = torch.empty((0,), dtype=q.dtype, device=q.device)
             assert attention_mask_type == AttnMaskType.PADDED_CAUSAL or AttnMaskType.PADDED
-            
+
         mask_type = attention_mask_type if attention_mask is not None else None
         attn_func = ColoAttention._dispatch_kernel(q.dtype, mask_type, memory_size)
         is_causal = attention_mask is not None and attention_mask_type in (

From 0a01e2a453abaa802cb839f67f7193af52709350 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 13 Sep 2024 03:33:08 +0000
Subject: [PATCH 078/167] fix the attn

---
 colossalai/shardformer/layer/attn.py | 44 +++++++++++++++-------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 0a4f985358af..1ffbae73e0a9 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -18,7 +18,7 @@
 
 from .utils import RingComm, get_half_index, split_varlen_zigzag
 
-MEMORY_BOUND = 10 * 1e9
+MEMORY_BOUND = 1 * 1e9
 
 __all__ = [
     "AttnMaskType",
@@ -125,9 +125,10 @@ def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType], size
                 mask_type
             ].load()
 
-        return (
-            FlashAttentionDaoLoader() if size > MEMORY_BOUND else ColoAttention._kernel_dispatch_map[dtype][mask_type]
-        )
+        if size >= MEMORY_BOUND and mask_type in (AttnMaskType.PADDED_CAUSAL, AttnMaskType.CAUSAL):
+            return FlashAttentionDaoLoader()
+        else:
+            return ColoAttention._kernel_dispatch_map[dtype][mask_type]
 
     @staticmethod
     def prepare_attn_kwargs(
@@ -163,6 +164,8 @@ def prepare_attn_kwargs(
             return {}
         assert len(shape_4d) == 4 and shape_4d[1] == 1
         b, _, s_q, s_kv = shape_4d
+        element_size = torch.tensor([], dtype=dtype).element_size()
+        memory_size = s_q * s_kv * element_size
         outputs = {}
         if (q_padding_mask is None or q_padding_mask.bool().all()) and (
             kv_padding_mask is None or kv_padding_mask.bool().all()
@@ -170,10 +173,13 @@ def prepare_attn_kwargs(
             # no padding
             assert is_causal
             outputs["attention_mask_type"] = AttnMaskType.CAUSAL
-            attention_mask = torch.ones(s_q, s_kv, dtype=dtype, device=device)
-            if s_q != 1:
-                attention_mask.tril_(diagonal=0)
-            attention_mask = attention_mask.expand(b, s_q, s_kv)
+            if memory_size > MEMORY_BOUND:
+                attention_mask = torch.empty((0,), dtype=dtype, device=device)
+            else:
+                attention_mask = torch.ones(s_q, s_kv, dtype=dtype, device=device)
+                if s_q != 1:
+                    attention_mask.tril_(diagonal=0)
+                attention_mask = attention_mask.expand(b, s_q, s_kv)
         else:
             max_seqlen_q, cu_seqlens_q, q_indices = get_pad_info(q_padding_mask)
             if kv_padding_mask is None:
@@ -186,7 +192,10 @@ def prepare_attn_kwargs(
                 b,
                 s_kv,
             ), f"Padding mask shape {kv_padding_mask.shape} should align with shape 4d ({b}, {s_kv})"
-            attention_mask = kv_padding_mask[:, None, :].expand(b, s_q, s_kv).to(dtype=dtype, device=device)
+            if memory_size > MEMORY_BOUND:
+                attention_mask = torch.empty((0,), dtype=dtype, device=device)
+            else:
+                attention_mask = kv_padding_mask[:, None, :].expand(b, s_q, s_kv).to(dtype=dtype, device=device)
             outputs.update(
                 {
                     "cu_seqlens_q": cu_seqlens_q,
@@ -199,22 +208,16 @@ def prepare_attn_kwargs(
             )
             if is_causal:
                 outputs["attention_mask_type"] = AttnMaskType.PADDED_CAUSAL
-                if s_q != 1:
-                    attention_mask = attention_mask * attention_mask.new_ones(s_q, s_kv).tril(diagonal=0)
+                if memory_size > MEMORY_BOUND:
+                    attention_mask = torch.empty((0,), dtype=dtype, device=device)
+                else:
+                    if s_q != 1:
+                        attention_mask = attention_mask * attention_mask.new_ones(s_q, s_kv).tril(diagonal=0)
             else:
                 outputs["attention_mask_type"] = AttnMaskType.PADDED
         if invert:
             attention_mask = invert_mask(attention_mask).unsqueeze(1)
         outputs["attention_mask"] = attention_mask
-
-        element_size = torch.tensor([], dtype=dtype).element_size()
-        memory_size = s_q * s_kv * element_size
-        if memory_size > MEMORY_BOUND:
-            attention_mask = torch.empty((0,), dtype=dtype, device=device)
-            outputs["attention_mask"] = attention_mask
-            if outputs["attention_mask_type"] != AttnMaskType.PADDED_CAUSAL:
-                outputs["attention_mask_type"] = AttnMaskType.CAUSAL
-
         return outputs
 
     @staticmethod
@@ -301,7 +304,6 @@ def attention(
         element_size = torch.tensor([], dtype=q.dtype).element_size()
         memory_size = s_q * s_kv * element_size
         if memory_size > MEMORY_BOUND:
-            attention_mask = torch.empty((0,), dtype=q.dtype, device=q.device)
             assert attention_mask_type == AttnMaskType.PADDED_CAUSAL or AttnMaskType.PADDED
 
         mask_type = attention_mask_type if attention_mask is not None else None

From 683179cefda2cb71c11d1ec83f75a4b9251ac0b0 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 13 Sep 2024 03:40:56 +0000
Subject: [PATCH 079/167] fix

---
 colossalai/shardformer/layer/attn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 1ffbae73e0a9..65051a61fe07 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -18,7 +18,7 @@
 
 from .utils import RingComm, get_half_index, split_varlen_zigzag
 
-MEMORY_BOUND = 1 * 1e9
+MEMORY_BOUND = 10 * 1e9
 
 __all__ = [
     "AttnMaskType",

From 6eb8832366c76187350059985d780acebbcd9a2d Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 13 Sep 2024 05:06:56 +0000
Subject: [PATCH 080/167] fix

---
 colossalai/shardformer/layer/attn.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 65051a61fe07..c18d57de1215 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -117,7 +117,7 @@ def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType], size
                 "FlashAttention kernel is not available for dtype {} and mask_type {}".format(dtype, mask_type)
             )
 
-        if size > MEMORY_BOUND:
+        if size >= MEMORY_BOUND:
             FlashAttentionDaoLoader().load()
         # lazy load
         if isinstance(ColoAttention._kernel_dispatch_map[dtype][mask_type], KernelLoader):
@@ -173,7 +173,7 @@ def prepare_attn_kwargs(
             # no padding
             assert is_causal
             outputs["attention_mask_type"] = AttnMaskType.CAUSAL
-            if memory_size > MEMORY_BOUND:
+            if memory_size >= MEMORY_BOUND:
                 attention_mask = torch.empty((0,), dtype=dtype, device=device)
             else:
                 attention_mask = torch.ones(s_q, s_kv, dtype=dtype, device=device)
@@ -192,10 +192,10 @@ def prepare_attn_kwargs(
                 b,
                 s_kv,
             ), f"Padding mask shape {kv_padding_mask.shape} should align with shape 4d ({b}, {s_kv})"
-            if memory_size > MEMORY_BOUND:
-                attention_mask = torch.empty((0,), dtype=dtype, device=device)
-            else:
+            if memory_size < MEMORY_BOUND and not is_causal:
                 attention_mask = kv_padding_mask[:, None, :].expand(b, s_q, s_kv).to(dtype=dtype, device=device)
+            else:
+                attention_mask = torch.empty((0,), dtype=dtype, device=device)
             outputs.update(
                 {
                     "cu_seqlens_q": cu_seqlens_q,
@@ -208,7 +208,7 @@ def prepare_attn_kwargs(
             )
             if is_causal:
                 outputs["attention_mask_type"] = AttnMaskType.PADDED_CAUSAL
-                if memory_size > MEMORY_BOUND:
+                if memory_size >= MEMORY_BOUND:
                     attention_mask = torch.empty((0,), dtype=dtype, device=device)
                 else:
                     if s_q != 1:
@@ -303,9 +303,6 @@ def attention(
         b, _, s_kv, _ = v.shape
         element_size = torch.tensor([], dtype=q.dtype).element_size()
         memory_size = s_q * s_kv * element_size
-        if memory_size > MEMORY_BOUND:
-            assert attention_mask_type == AttnMaskType.PADDED_CAUSAL or AttnMaskType.PADDED
-
         mask_type = attention_mask_type if attention_mask is not None else None
         attn_func = ColoAttention._dispatch_kernel(q.dtype, mask_type, memory_size)
         is_causal = attention_mask is not None and attention_mask_type in (

From f393867cff97924e0b90d81758a29bc5a2e94923 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 13 Sep 2024 05:24:52 +0000
Subject: [PATCH 081/167] fix

---
 colossalai/shardformer/layer/attn.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index c18d57de1215..8890da242a02 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -173,13 +173,13 @@ def prepare_attn_kwargs(
             # no padding
             assert is_causal
             outputs["attention_mask_type"] = AttnMaskType.CAUSAL
-            if memory_size >= MEMORY_BOUND:
-                attention_mask = torch.empty((0,), dtype=dtype, device=device)
-            else:
+            if memory_size < MEMORY_BOUND and not is_causal:
                 attention_mask = torch.ones(s_q, s_kv, dtype=dtype, device=device)
                 if s_q != 1:
                     attention_mask.tril_(diagonal=0)
                 attention_mask = attention_mask.expand(b, s_q, s_kv)
+            else:
+                attention_mask = torch.empty((0,), dtype=dtype, device=device)
         else:
             max_seqlen_q, cu_seqlens_q, q_indices = get_pad_info(q_padding_mask)
             if kv_padding_mask is None:
@@ -208,11 +208,11 @@ def prepare_attn_kwargs(
             )
             if is_causal:
                 outputs["attention_mask_type"] = AttnMaskType.PADDED_CAUSAL
-                if memory_size >= MEMORY_BOUND:
-                    attention_mask = torch.empty((0,), dtype=dtype, device=device)
-                else:
+                if memory_size < MEMORY_BOUND:
                     if s_q != 1:
                         attention_mask = attention_mask * attention_mask.new_ones(s_q, s_kv).tril(diagonal=0)
+                else:
+                    attention_mask = torch.empty((0,), dtype=dtype, device=device)
             else:
                 outputs["attention_mask_type"] = AttnMaskType.PADDED
         if invert:

From dc032172c34538abcdad101997b9637b70ef0552 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 13 Sep 2024 06:00:58 +0000
Subject: [PATCH 082/167] fix

---
 colossalai/shardformer/layer/attn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 8890da242a02..a2ea761bf3e9 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -173,7 +173,7 @@ def prepare_attn_kwargs(
             # no padding
             assert is_causal
             outputs["attention_mask_type"] = AttnMaskType.CAUSAL
-            if memory_size < MEMORY_BOUND and not is_causal:
+            if memory_size < MEMORY_BOUND:
                 attention_mask = torch.ones(s_q, s_kv, dtype=dtype, device=device)
                 if s_q != 1:
                     attention_mask.tril_(diagonal=0)

From 696fced0d722ab582568fb5b6f6d7dbc536d3053 Mon Sep 17 00:00:00 2001
From: botbw <wang1570@e.ntu.edu.sg>
Date: Fri, 13 Sep 2024 14:30:05 +0800
Subject: [PATCH 083/167] [fp8] fix missing fp8_comm flag in mixtral (#6057)

---
 colossalai/shardformer/modeling/mixtral.py | 7 ++++++-
 colossalai/shardformer/policies/mixtral.py | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index 0103808dc6a9..4f8ec162f60d 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -31,6 +31,7 @@
     all_to_all_uneven,
 )
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.quantization.fp8 import all_reduce_fp8
 from colossalai.shardformer.layer._operation import (
     all_to_all_comm,
     gather_forward_split_backward,
@@ -142,7 +143,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             for i in range(1, self.ep_size):
                 activate_experts += output_split_sizes[i * self.num_experts_per_ep : (i + 1) * self.num_experts_per_ep]
             activate_experts = (activate_experts > 0).float()
-        dist.all_reduce(activate_experts, group=self.moe_dp_group)
+
+        if self.fp8_communication:
+            all_reduce_fp8(activate_experts, group=self.moe_dp_group)
+        else:
+            dist.all_reduce(activate_experts, group=self.moe_dp_group)
 
         input_split_list = input_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()
         output_split_list = output_split_sizes.view(self.ep_size, self.num_experts_per_ep).sum(dim=-1).tolist()
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index 9f03319e7e5b..8e2ca5de0556 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -178,6 +178,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                             "ep_group": self.shard_config.ep_group,
                             "tp_group": self.shard_config.tensor_parallel_process_group,
                             "moe_dp_group": self.shard_config.moe_dp_group,
+                            "fp8_communication": self.shard_config.fp8_communication,
                         },
                     )
                 ],

From 0b14a5512e5220450b61effd15ff2d9c93ef7d22 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 13 Sep 2024 07:06:14 +0000
Subject: [PATCH 084/167] fix

---
 colossalai/shardformer/layer/attn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index a2ea761bf3e9..c755ffa2f211 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -118,7 +118,7 @@ def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType], size
             )
 
         if size >= MEMORY_BOUND:
-            FlashAttentionDaoLoader().load()
+            flash_kernel = FlashAttentionDaoLoader().load()
         # lazy load
         if isinstance(ColoAttention._kernel_dispatch_map[dtype][mask_type], KernelLoader):
             ColoAttention._kernel_dispatch_map[dtype][mask_type] = ColoAttention._kernel_dispatch_map[dtype][
@@ -126,7 +126,7 @@ def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType], size
             ].load()
 
         if size >= MEMORY_BOUND and mask_type in (AttnMaskType.PADDED_CAUSAL, AttnMaskType.CAUSAL):
-            return FlashAttentionDaoLoader()
+            return flash_kernel
         else:
             return ColoAttention._kernel_dispatch_map[dtype][mask_type]
 

From 0ad3129cb95cb74f13dead3f8369837ea997b499 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 13 Sep 2024 09:01:26 +0000
Subject: [PATCH 085/167] fix

---
 colossalai/shardformer/layer/attn.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index c755ffa2f211..129b04fb796d 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -80,6 +80,7 @@ def get_pad_info(
 
 class ColoAttention:
     _kernel_dispatch_map: Optional[Dict[torch.dtype, Dict[Optional[AttnMaskType], Callable]]] = None
+    _flash_kernel_dispatch: Optional[Dict[torch.dtype, Dict[Optional[AttnMaskType], Callable]]] = None
 
     @staticmethod
     def _init_kernels_dispatch():
@@ -105,6 +106,8 @@ def _init_kernels_dispatch():
                 torch.bfloat16: half_dispatch_map,
                 torch.float32: float_dispatch_map,
             }
+        if ColoAttention._flash_kernel_dispatch is None:
+            ColoAttention._flash_kernel_dispatch = FlashAttentionDaoLoader()
 
     @staticmethod
     def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType], size) -> Callable:
@@ -118,7 +121,7 @@ def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType], size
             )
 
         if size >= MEMORY_BOUND:
-            flash_kernel = FlashAttentionDaoLoader().load()
+            ColoAttention._flash_kernel_dispatch = ColoAttention._flash_kernel_dispatch.load()
         # lazy load
         if isinstance(ColoAttention._kernel_dispatch_map[dtype][mask_type], KernelLoader):
             ColoAttention._kernel_dispatch_map[dtype][mask_type] = ColoAttention._kernel_dispatch_map[dtype][
@@ -126,7 +129,7 @@ def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType], size
             ].load()
 
         if size >= MEMORY_BOUND and mask_type in (AttnMaskType.PADDED_CAUSAL, AttnMaskType.CAUSAL):
-            return flash_kernel
+            return ColoAttention._flash_kernel_dispatch
         else:
             return ColoAttention._kernel_dispatch_map[dtype][mask_type]
 

From b5823192738299b602b0876c486ae97964652326 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 13 Sep 2024 10:24:41 +0000
Subject: [PATCH 086/167] fix

---
 colossalai/shardformer/layer/attn.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 129b04fb796d..7157fbed8163 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -210,6 +210,7 @@ def prepare_attn_kwargs(
                 }
             )
             if is_causal:
+                attention_mask = kv_padding_mask[:, None, :].expand(b, s_q, s_kv).to(dtype=dtype, device=device)
                 outputs["attention_mask_type"] = AttnMaskType.PADDED_CAUSAL
                 if memory_size < MEMORY_BOUND:
                     if s_q != 1:

From f20b066c591fc69bac41757de7d3cfa1e081f688 Mon Sep 17 00:00:00 2001
From: Guangyao Zhang <xjtu521@qq.com>
Date: Sat, 14 Sep 2024 10:40:01 +0800
Subject: [PATCH 087/167] [fp8] Disable all_gather intranode. Disable Redundant
 all_gather fp8 (#6059)

* all_gather only internode, fix pytest

* fix cuda arch <89 compile pytest error

* fix pytest failure

* disable all_gather_into_tensor_flat_fp8

* fix fp8 format

* fix pytest

* fix conversations

* fix chunk tuple to list
---
 colossalai/quantization/fp8.py                | 97 ++++---------------
 colossalai/shardformer/layer/_operation.py    |  4 +-
 colossalai/zero/gemini/chunk/chunk.py         | 12 ++-
 .../low_level/bookkeeping/tensor_bucket.py    |  4 +-
 colossalai/zero/low_level/low_level_optim.py  |  9 +-
 tests/test_fp8/test_fp8_all_to_all.py         |  4 +-
 ...st_fp8_gather.py => test_fp8_allgather.py} | 17 +---
 tests/test_fp8/test_fp8_allgather_flat.py     | 43 --------
 8 files changed, 43 insertions(+), 147 deletions(-)
 rename tests/test_fp8/{test_fp8_gather.py => test_fp8_allgather.py} (81%)
 delete mode 100644 tests/test_fp8/test_fp8_allgather_flat.py

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 388bbde052d2..8243a29ac825 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -10,6 +10,10 @@
 
 SUPPORT_TORCH_COMPILE = Version(torch.__version__) >= Version("2.4.0")
 SCALE_BYTES = 4
+try:
+    cuda_arch = int("".join(str(i) for i in torch.cuda.get_device_capability()))
+except:
+    cuda_arch = 0
 
 
 class Handle:
@@ -185,7 +189,7 @@ def all_reduce_fp8(
     return dist.all_reduce(tensor, op=op, group=group, async_op=async_op)
 
 
-@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False)
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False, disable=cuda_arch < 89)
 def _all_to_all_single_fp8(
     output, input, output_split_sizes=None, input_split_sizes=None, fp8_format="e5m2", group=None, async_op=False
 ) -> Optional[Handle]:
@@ -606,79 +610,7 @@ def split_chunk_by_channel(
     return chunk.split(sizes)
 
 
-def all_gather_into_tensor_flat_fp8(
-    output_tensor: torch.Tensor,
-    input_tensor: torch.Tensor,
-    output_shape: torch.Size,
-    group: dist.ProcessGroup,
-    fp8_format: str = "e4m3",
-    async_op: bool = False,
-) -> Optional[Handle]:
-    """all gather into tensor in fp8 format
-
-    Args:
-        output_tensor (torch.Tensor): output tensor, which is flattened
-        input_tensor (torch.Tensor): input tensor, which is flattened
-        group (dist.ProcessGroup): process group
-        fp8_format (str, optional): fp8 format, e4m3 or e5m2. Defaults to "e4m3".
-    """
-    assert input_tensor.dim() == 1 and output_tensor.dim() == 1, "input/output tensor should be flattened"
-    world_size = dist.get_world_size(group)
-    assert (
-        output_tensor.numel() == input_tensor.numel() * world_size
-    ), "output tensor size should be world_size times of input tensor size"
-
-    input_type = output_tensor.dtype
-
-    fp8_type = torch.float8_e4m3fn if fp8_format == "e4m3" else torch.float8_e5m2
-    fp8_max = torch.finfo(fp8_type).max
-
-    if len(output_shape) == 2:
-        per_channel_max = torch.zeros(output_shape[0], device=output_tensor.device, dtype=torch.float)
-        num_channels, channel_size = output_shape
-        rank = dist.get_rank(group)
-        channel_start_idx = (input_tensor.numel() * rank) // channel_size
-        per_channel_splits = split_chunk_by_channel(input_tensor, channel_size, num_channels, rank, world_size)
-        for i, per_channel_split in enumerate(per_channel_splits):
-            idx = i + channel_start_idx
-            if idx < num_channels:
-                per_channel_max[idx] = per_channel_split.abs().max().float()
-        dist.all_reduce(per_channel_max, op=dist.ReduceOp.MAX, group=group)
-        per_channel_max = torch.where(per_channel_max > 0, per_channel_max, 1.0)
-        scale = fp8_max / per_channel_max
-        fp8_input = input_tensor.float()
-        fp8_per_channel_splits = split_chunk_by_channel(fp8_input, channel_size, num_channels, rank, world_size)
-        for i, per_channel_split in enumerate(fp8_per_channel_splits):
-            idx = i + channel_start_idx
-            if idx < num_channels:
-                per_channel_split.mul_(scale[idx])
-        fp8_input = fp8_input.to(fp8_type)
-    else:
-        per_tensor_max = input_tensor.abs().max().float()
-        dist.all_reduce(per_tensor_max, op=dist.ReduceOp.MAX, group=group)
-        per_tensor_max = torch.where(per_tensor_max > 0, per_tensor_max, 1.0)
-        scale = fp8_max / per_tensor_max
-        fp8_input = (scale * input_tensor.float()).to(fp8_type)
-    scale_inv = 1.0 / scale
-
-    buffer = torch.empty_like(output_tensor, dtype=fp8_type)
-    tensor_handle = dist.all_gather_into_tensor(
-        buffer.view(torch.uint8), fp8_input.view(torch.uint8), group=group, async_op=async_op
-    )
-
-    def cast_op():
-        numel = output_shape.numel()
-        valid_buffer = buffer[:numel].reshape(output_shape)
-        valid_buffer = cast_from_fp8(valid_buffer, scale_inv, input_type, per_channel_scale=(len(output_shape) == 2))
-        output_tensor[:numel].copy_(valid_buffer.view(-1))
-
-    if async_op:
-        return Handle([tensor_handle], cast_op)
-    else:
-        cast_op()
-
-
-@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False)
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False, disable=cuda_arch < 89)
 def _all_to_all_fp8(output_list, input_list, group=None, fp8_format="e5m2", async_op=False):
     world_size = dist.get_world_size(group)
     input_type = input_list[0].dtype
@@ -718,8 +650,8 @@ def all_to_all_fp8(output_list, input_list, group=None, fp8_format="e5m2", async
         return _all_to_all_fp8(output_list, input_list, group=group, fp8_format=fp8_format, async_op=async_op)
 
 
-def gather_fp8(output_list, input_, group=None, fp8_format="e5m2", async_op: bool = False) -> Optional[Handle]:
-
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False, disable=cuda_arch < 89)
+def _all_gather_fp8(output_list, input_, group=None, fp8_format="e5m2", async_op: bool = False) -> Optional[Handle]:
     world_size = dist.get_world_size(group)
 
     input_type = input_.dtype
@@ -743,8 +675,17 @@ def cast_op():
         cast_op()
 
 
-@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False)
 def all_gather_fp8(output_list, input_, group=None, fp8_format="e5m2", async_op: bool = False) -> Optional[Handle]:
+    if process_group_is_intranode(group):
+        return dist.all_gather(output_list, input_, group=group, async_op=async_op)
+    else:
+        return _all_gather_fp8(output_list, input_, group=group, fp8_format=fp8_format, async_op=async_op)
+
+
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False, disable=cuda_arch < 89)
+def all_gather_fp8_lagacy(
+    output_list, input_, group=None, fp8_format="e5m2", async_op: bool = False
+) -> Optional[Handle]:
     world_size = dist.get_world_size(group)
     shape = input_.shape
     input_type = input_.dtype
@@ -769,7 +710,7 @@ def all_gather_fp8(output_list, input_, group=None, fp8_format="e5m2", async_op:
     #     out.copy_(output[i].view(shape))
 
 
-@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False)
+@torch.compile(mode="max-autotune-no-cudagraphs", dynamic=False, disable=cuda_arch < 89)
 def all_gather_fp8_ring(output_list, input_, group=None, fp8_format="e5m2", async_op: bool = False) -> Optional[Handle]:
     world_size = dist.get_world_size(group)
     rank = dist.get_rank(group)
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index f970d8ccc85d..aec82356747a 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -17,10 +17,10 @@
     _grad_accum_fusion_available = False
 
 from colossalai.quantization.fp8 import (
+    all_gather_fp8,
     all_reduce_fp8,
     all_to_all_fp8,
     all_to_all_single_fp8,
-    gather_fp8,
     reduce_scatter_fp8,
 )
 
@@ -961,7 +961,7 @@ def _gather(input_, dim=-1, process_group=None, fp8_communication=False, fp8_for
     input_ = input_.contiguous()
     tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
     if fp8_communication:
-        gather_fp8(tensor_list, input_, fp8_format=fp8_format, group=process_group)
+        all_gather_fp8(tensor_list, input_, fp8_format=fp8_format, group=process_group)
     else:
         dist.all_gather(tensor_list, input_, group=process_group)
 
diff --git a/colossalai/zero/gemini/chunk/chunk.py b/colossalai/zero/gemini/chunk/chunk.py
index e2b7a8f56432..351ff14e0131 100644
--- a/colossalai/zero/gemini/chunk/chunk.py
+++ b/colossalai/zero/gemini/chunk/chunk.py
@@ -7,6 +7,7 @@
 from torch.distributed import ProcessGroup
 
 from colossalai.accelerator import get_accelerator
+from colossalai.quantization.fp8 import all_gather_fp8
 
 
 class TensorState(Enum):
@@ -523,11 +524,12 @@ def __gather(self, async_op: bool = False) -> Optional[dist.Work]:
             alloc_storage(self.cuda_global_chunk)
             assert self.cuda_global_chunk.is_contiguous()
             if self.fp8_communication:
-                assert async_op == False, "fp8 all-gather does not support async_op!"
-                from colossalai.quantization.fp8 import all_gather_into_tensor_flat_fp8
-
-                work = all_gather_into_tensor_flat_fp8(
-                    self.cuda_global_chunk, self.cuda_shard, self.cuda_global_chunk.shape, self.torch_pg
+                work = all_gather_fp8(
+                    list(self.cuda_global_chunk.chunk(self.pg_size)),
+                    self.cuda_shard,
+                    self.torch_pg,
+                    fp8_format="e4m3",
+                    async_op=async_op,
                 )
             else:
                 work = dist.all_gather_into_tensor(
diff --git a/colossalai/zero/low_level/bookkeeping/tensor_bucket.py b/colossalai/zero/low_level/bookkeeping/tensor_bucket.py
index d5fd2fe51662..3c95aa6babcd 100644
--- a/colossalai/zero/low_level/bookkeeping/tensor_bucket.py
+++ b/colossalai/zero/low_level/bookkeeping/tensor_bucket.py
@@ -4,7 +4,7 @@
 import torch.distributed as dist
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 
-from colossalai.quantization.fp8 import all_gather_into_tensor_flat_fp8
+from colossalai.quantization.fp8 import all_gather_fp8
 
 
 class TensorBucket:
@@ -67,7 +67,7 @@ def all_gather(self, group=None, fp8_communication: bool = False):
         flat = self.flatten()
         buffer = torch.empty(flat.numel() * dist.get_world_size(group), device=flat.device, dtype=flat.dtype)
         if fp8_communication:
-            all_gather_into_tensor_flat_fp8(buffer, flat, output_shape=buffer.shape, group=group)
+            all_gather_fp8(list(buffer.chunk(dist.get_world_size(group))), flat, group=group, fp8_format="e4m3")
         else:
             dist.all_gather_into_tensor(buffer, flat, group=group)
         unflat_buffers = [self.unflatten(buffer) for buffer in buffer.chunk(dist.get_world_size(group))]
diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py
index ed51c2bacafc..c019ff19b84c 100644
--- a/colossalai/zero/low_level/low_level_optim.py
+++ b/colossalai/zero/low_level/low_level_optim.py
@@ -20,7 +20,7 @@
 )
 from colossalai.interface import OptimizerWrapper
 from colossalai.logging import get_dist_logger
-from colossalai.quantization.fp8 import all_gather_into_tensor_flat_fp8, all_reduce_fp8, reduce_scatter_fp8
+from colossalai.quantization.fp8 import all_gather_fp8, all_reduce_fp8, reduce_scatter_fp8
 from colossalai.tensor.moe_tensor.api import is_moe_tensor
 
 from ._utils import calculate_global_norm_from_list, has_inf_or_nan, release_param_grad, sync_tensor
@@ -580,8 +580,11 @@ def step(self, closure=None):
                 else:
                     if param_to_gather.numel() > self.pg_to_tensor_bucket[pg].max_size:
                         if self._fp8_communication:
-                            all_gather_into_tensor_flat_fp8(
-                                padded_working_param, param_to_gather, pg, fp8_format="e4m3"
+                            all_gather_fp8(
+                                list(padded_working_param.chunk(dist.get_world_size(pg))),
+                                param_to_gather,
+                                pg,
+                                fp8_format="e4m3",
                             )
                         else:
                             dist.all_gather_into_tensor(padded_working_param, param_to_gather, pg)
diff --git a/tests/test_fp8/test_fp8_all_to_all.py b/tests/test_fp8/test_fp8_all_to_all.py
index 884aab744ba0..98bbbad8550d 100644
--- a/tests/test_fp8/test_fp8_all_to_all.py
+++ b/tests/test_fp8/test_fp8_all_to_all.py
@@ -5,7 +5,7 @@
 
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
-from colossalai.quantization.fp8 import all_to_all_fp8
+from colossalai.quantization.fp8 import _all_to_all_fp8
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 
 
@@ -20,7 +20,7 @@ def check_4gpu(shape, scatter_dim, dtype, fp8_format):
     input_tensor_list = [x.contiguous() for x in input_tensor_list]
     output_tensor_list_fp8 = [torch.empty_like(x) for x in input_tensor_list]
     output_tensor_list = [torch.empty_like(x) for x in input_tensor_list]
-    all_to_all_fp8(output_tensor_list_fp8, input_tensor_list, group=_get_default_group(), fp8_format=fp8_format)
+    _all_to_all_fp8(output_tensor_list_fp8, input_tensor_list, group=_get_default_group(), fp8_format=fp8_format)
     dist.all_to_all(output_tensor_list, input_tensor_list, group=_get_default_group())
     assert_close(output_tensor_list_fp8, output_tensor_list, rtol=0.1, atol=0.1)
 
diff --git a/tests/test_fp8/test_fp8_gather.py b/tests/test_fp8/test_fp8_allgather.py
similarity index 81%
rename from tests/test_fp8/test_fp8_gather.py
rename to tests/test_fp8/test_fp8_allgather.py
index 40c2ccb9a17b..91e66e83c67b 100644
--- a/tests/test_fp8/test_fp8_gather.py
+++ b/tests/test_fp8/test_fp8_allgather.py
@@ -5,22 +5,13 @@
 
 from colossalai import launch
 from colossalai.accelerator import get_accelerator
-from colossalai.quantization.fp8 import gather_fp8
+from colossalai.quantization.fp8 import _all_gather_fp8
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 
 
 @parameterize(
     "shape",
-    [
-        (3, 7),
-        (2, 1),
-        (1, 2),
-        (2, 2),
-        (4, 2),
-        (5,),
-        (4,),
-        (2,),
-    ],
+    [(3, 7, 16)],
 )
 @parameterize("dtype", [torch.bfloat16, torch.float16])
 @parameterize("fp8_format", ["e4m3", "e5m2"])
@@ -30,7 +21,9 @@ def check_4gpu(shape, dtype, fp8_format, async_op):
     x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
     output_list = [torch.empty_like(x) for _ in range(world_size)]
     output_list_fp8 = [torch.empty_like(x) for _ in range(world_size)]
-    fp8_handle = gather_fp8(output_list_fp8, x, group=_get_default_group(), fp8_format=fp8_format, async_op=async_op)
+    fp8_handle = _all_gather_fp8(
+        output_list_fp8, x, group=_get_default_group(), fp8_format=fp8_format, async_op=async_op
+    )
     origin_hanle = dist.all_gather(output_list, x, group=_get_default_group(), async_op=async_op)
     if async_op:
         fp8_handle.wait()
diff --git a/tests/test_fp8/test_fp8_allgather_flat.py b/tests/test_fp8/test_fp8_allgather_flat.py
deleted file mode 100644
index 2d43e5bd5902..000000000000
--- a/tests/test_fp8/test_fp8_allgather_flat.py
+++ /dev/null
@@ -1,43 +0,0 @@
-import torch
-import torch.distributed as dist
-import torch.nn.functional as F
-from torch.distributed.distributed_c10d import _get_default_group
-from torch.testing import assert_close
-
-from colossalai import launch
-from colossalai.accelerator import get_accelerator
-from colossalai.quantization.fp8 import all_gather_into_tensor_flat_fp8
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-
-
-@parameterize("shape", [(3, 7), (2, 1), (1, 2), (2, 2), (4, 2), (5,), (4,), (2,)])
-@parameterize("dtype", [torch.bfloat16, torch.float16])
-@parameterize("async_op", [True, False])
-def check_4gpu(shape, dtype, async_op):
-    world_size = dist.get_world_size()
-    rank = dist.get_rank()
-    x = torch.rand(shape, dtype=dtype, device=get_accelerator().get_current_device())
-    flat_padded_x = x.view(-1)
-    if flat_padded_x.size(0) % world_size != 0:
-        pad_size = world_size - flat_padded_x.size(0) % world_size
-        flat_padded_x = F.pad(flat_padded_x, (0, pad_size))
-    output = torch.empty_like(flat_padded_x)
-    chunk = flat_padded_x.chunk(world_size)[rank].clone()
-    handle = all_gather_into_tensor_flat_fp8(output, chunk, x.shape, group=_get_default_group(), async_op=async_op)
-    if async_op:
-        handle.wait()
-    assert_close(output[: x.numel()], x.view(-1), rtol=0.1, atol=0.1)
-
-
-def run_dist(rank, world_size, port):
-    launch(rank=rank, world_size=world_size, port=port, host="localhost")
-    check_4gpu()
-
-
-@rerun_if_address_is_in_use()
-def test_all_gather_flat():
-    spawn(run_dist, 4)
-
-
-if __name__ == "__main__":
-    test_all_gather_flat()

From bdb125f83fb079e82963aeca6f50a23493a2ac6b Mon Sep 17 00:00:00 2001
From: Guangyao Zhang <xjtu521@qq.com>
Date: Sat, 14 Sep 2024 11:01:05 +0800
Subject: [PATCH 088/167] [doc] FP8 training and communication document (#6050)

* Add FP8 training and communication document

* add fp8 docstring for plugins

* fix typo

* fix typo
---
 colossalai/booster/plugin/gemini_plugin.py         |  2 ++
 .../booster/plugin/hybrid_parallel_plugin.py       |  3 ++-
 colossalai/booster/plugin/low_level_zero_plugin.py |  2 ++
 .../booster/plugin/moe_hybrid_parallel_plugin.py   |  4 +++-
 colossalai/booster/plugin/torch_ddp_plugin.py      |  1 +
 .../mixed_precision_training_with_booster.md       | 12 ++++++++++--
 .../mixed_precision_training_with_booster.md       | 14 +++++++++++---
 7 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index 6a5d0c1613df..ae49aa8b148d 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -323,7 +323,9 @@ class GeminiPlugin(DPPluginBase):
         enable_jit_fused (bool, optional): Whether to switch on JIT in Shardformer. Default to False.
         enable_sequence_parallelism (bool): Whether to turn on sequence parallelism in Shardformer. Defaults to False.
         enable_sequence_overlap (bool): Whether to turn on sequence overlap in Shardformer. Defaults to False.
+        use_fp8 (bool, optional): Whether to enable fp8 mixed precision training. Defaults to False.
         verbose (bool, optional): verbose mode. Debug info including chunk search result will be printed. Defaults to False.
+        fp8_communication (bool, optional): Whether to enable fp8 communication. Defaults to False.
     """
 
     def __init__(
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 8e972d0146da..bb663f6a6d5e 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -981,7 +981,8 @@ class HybridParallelPlugin(PipelinePluginBase):
         gradient_checkpoint_config (GradientCheckpointConfig, optional): Configuration for gradient checkpointing. Defaults to None.
         enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
         make_vocab_size_divisible_by (int, optional): it's used when padding the vocabulary size, to make it choose an faster kenel. Default to 64.
-        fp8_communication (bool, optional): Whether to enable fp8 communication in model parallelism
+        fp8_communication (bool, optional): Whether to enable fp8 communication. Defaults to False.
+        use_fp8 (bool, optional): Whether to enable fp8 mixed precision training. Defaults to False.
         overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism
         inner_ring_size (int, optional): The inner ring size of 2D Ring Attention when sp mode is "ring_attn".
             It's advisable to not tune this (especially in single-node settings) and let it be heuristically set based on topology by default.
diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index cec15dd5dd34..b167b5c7a59e 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -327,6 +327,8 @@ class LowLevelZeroPlugin(DPPluginBase):
         overlap_communication (bool, optional): whether to overlap communication and computation. Defaults to True.
         cpu_offload (bool, optional): whether to offload grad, master weight and optimizer state to cpu. Defaults to False.
         verbose (bool, optional): verbose mode. Debug info including grad overflow will be printed. Defaults to False.
+        use_fp8 (bool, optional): Whether to enable fp8 mixed precision training. Defaults to False.
+        fp8_communication (bool, optional): Whether to enable fp8 communication. Defaults to False.
     """
 
     def __init__(
diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index 2324a5239d79..0807b374901a 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -170,7 +170,9 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
         gradient_checkpoint_config (GradientCheckpointConfig, optional): Configuration for gradient checkpointing. Defaults to None.
         enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
         make_vocab_size_divisible_by (int, optional): it's used when padding the vocabulary size, to make it choose an faster kenel. Default to 64.
-        overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism
+        overlap_p2p (bool, optional): Whether to overlap the p2p communication in pipeline parallelism.
+        use_fp8 (bool, optional): Whether to enable fp8 mixed precision training. Defaults to False.
+        fp8_communication (bool, optional): Whether to enable fp8 communication. Defaults to False.
     """
 
     def __init__(
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
index 61d785a4c9e7..ec7ce7f9aae4 100644
--- a/colossalai/booster/plugin/torch_ddp_plugin.py
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -169,6 +169,7 @@ class TorchDDPPlugin(DPPluginBase):
         check_reduction (bool, optional): Whether to check reduction. Defaults to False.
         gradient_as_bucket_view (bool, optional): Whether to use gradient as bucket view. Defaults to False.
         static_graph (bool, optional): Whether to use static graph. Defaults to False.
+        fp8_communication (bool, optional): Whether to enable fp8 communication. Defaults to False.
     """
 
     def __init__(
diff --git a/docs/source/en/features/mixed_precision_training_with_booster.md b/docs/source/en/features/mixed_precision_training_with_booster.md
index baaaacdddf9e..65304b1f4e65 100644
--- a/docs/source/en/features/mixed_precision_training_with_booster.md
+++ b/docs/source/en/features/mixed_precision_training_with_booster.md
@@ -9,6 +9,7 @@ Author: [Mingyan Jiang](https://github.com/jiangmingyan)
 **Related Paper**
 
 - [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794)
+- [FP8 Formats for Deep Learning](https://arxiv.org/pdf/2209.05433)
 
 ## Introduction
 
@@ -60,7 +61,11 @@ However, there are other operations, like reductions, which require the dynamic
 
 ## AMP in Colossal-AI
 
-We supported three AMP training methods and allowed the user to train with AMP with no code. If you want to train with amp, just assign `mixed_precision` with `fp16` when you instantiate the `Booster`. Next we will support `bf16`, `fp8`.
+We supported three AMP training methods and allowed the user to train with AMP with no code. If you want to train with amp, just assign `mixed_precision` with `fp16` when you instantiate the `Booster`. Next we will support `bf16`.
+
+Currently we only support `fp8` mixed precision training for the `Linear` layer. Please specify the `use_fp8` parameter when create the plugin object.
+
+To reduce the communication volume inter nodes in low-bandwidth scenarios, we support FP8 communication compression. Please specify the `fp8_communication` parameter when create the  plugin object.
 
 ### Start with Booster
 
@@ -74,7 +79,6 @@ instantiate `Booster` with `mixed_precision="fp16"`, then you can train with tor
     'fp16': torch amp
     'fp16_apex': apex amp,
     'bf16': bf16,
-    'fp8': fp8,
     'fp16_naive': naive amp
 """
 from colossalai import Booster
@@ -128,6 +132,10 @@ The output model is converted to AMP model of smaller memory consumption.
 If your input model is already too large to fit in a GPU, please instantiate your model weights in `dtype=torch.float16`.
 Otherwise, try smaller models or checkout more parallelization training techniques!
 
+### FP8 Communication
+
+In low-bandwidth scenarios, to reduce the communication load multiple nodes, we support FP8 communication compression, which can be enabled by using `fp8_communication=True` when you when create the plugin object (such as `GeminiPlugin`). The all-to-all, all-gather and P2P operations inter nodes will use FP8 format for data transmission. Currently the FP8 communication of reduction operators such as all-reduce and reduce-scatter is currently not supported due to lack of support of the NCCL library.
+
 ## Hands-on Practice
 
 Now we will introduce the use of AMP with Colossal-AI. In this practice, we will use Torch AMP as an example.
diff --git a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
index 53d9013db296..da377ceb294b 100644
--- a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
+++ b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
@@ -9,6 +9,7 @@
 **相关论文**
 
 - [Accelerating Scientific Computations with Mixed Precision Algorithms](https://arxiv.org/abs/0808.2794)
+- [FP8 Formats for Deep Learning](https://arxiv.org/pdf/2209.05433)
 
 ## 引言
 
@@ -56,9 +57,13 @@ AMP 代表自动混合精度训练。
 
 ## Colossal-AI 中的 AMP
 
-我们支持三种 AMP 训练方法，并允许用户在没有改变代码的情况下使用 AMP 进行训练。booster 支持 amp 特性注入，如果您要使用混合精度训练，则在创建 booster 实例时指定`mixed_precision`参数;后续将会拓展`bf16`,`pf8`的混合精度训练.
+我们支持三种 AMP 训练方法，并允许用户在没有改变代码的情况下使用 AMP 进行训练。booster 支持 amp 特性注入，如果您要使用混合精度训练，则在创建 booster 实例时指定`mixed_precision`参数; 后续将会拓展`bf16`.
 
-#### booster 启动方式
+我们目前只支持`Linear`层的`fp8`混合精度训练，如果您需要使用，请在创建 plugin实例时指定`use_fp8`参数。
+
+为了减少低带宽场景下多机之间的通讯负载，我们还支持了FP8通讯。如果您需要使用，请在创建 plugin实例时指定`fp8_communication`参数。
+
+### booster 启动方式
 
 您可以在创建 booster 实例时，指定`mixed_precision="fp16"`即使用 torch amp。
 
@@ -70,7 +75,6 @@ AMP 代表自动混合精度训练。
     'fp16': torch amp
     'fp16_apex': apex amp,
     'bf16': bf16,
-    'fp8': fp8,
     'fp16_naive': naive amp
 """
 from colossalai import Booster
@@ -118,6 +122,10 @@ booster = Booster(mixed_precision=mixed_precision,...)
 
 当使用`colossalai.booster`时, 首先需要实例化一个模型、一个优化器和一个标准。将输出模型转换为内存消耗较小的 AMP 模型。如果您的输入模型已经太大，无法放置在 GPU 中，请使用`dtype=torch.float16`实例化你的模型。或者请尝试更小的模型，或尝试更多的并行化训练技术！
 
+### FP8通讯
+
+在低带宽场景下，为了减少多机间的通讯负载，我们支持使用FP8的形式对通讯进行压缩，可以在初始化plugin实例（如`GeminiPlugin`）时使用fp8_communication=True来启用。此时多机之间all-to-all, all-gather以及P2P操作将使用FP8的格式进行数据传输。受限于NCCL库的支持，目前不支持缩减(Reduction)算子如Allreduce, ReduceScatter的FP8通讯。
+
 ## 实例
 
 下面我们将展现如何在 Colossal-AI 使用 AMP。在该例程中，我们使用 Torch AMP.

From 827ef3ee9a176de774422f7361cc95efae57e3f1 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Sat, 14 Sep 2024 10:40:35 +0000
Subject: [PATCH 089/167] fix

---
 colossalai/shardformer/layer/attn.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 7157fbed8163..2f8e4d677c54 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -195,10 +195,6 @@ def prepare_attn_kwargs(
                 b,
                 s_kv,
             ), f"Padding mask shape {kv_padding_mask.shape} should align with shape 4d ({b}, {s_kv})"
-            if memory_size < MEMORY_BOUND and not is_causal:
-                attention_mask = kv_padding_mask[:, None, :].expand(b, s_q, s_kv).to(dtype=dtype, device=device)
-            else:
-                attention_mask = torch.empty((0,), dtype=dtype, device=device)
             outputs.update(
                 {
                     "cu_seqlens_q": cu_seqlens_q,
@@ -210,15 +206,18 @@ def prepare_attn_kwargs(
                 }
             )
             if is_causal:
-                attention_mask = kv_padding_mask[:, None, :].expand(b, s_q, s_kv).to(dtype=dtype, device=device)
                 outputs["attention_mask_type"] = AttnMaskType.PADDED_CAUSAL
                 if memory_size < MEMORY_BOUND:
                     if s_q != 1:
+                        attention_mask = kv_padding_mask[:, None, :].expand(b, s_q, s_kv).to(dtype=dtype, device=device)
                         attention_mask = attention_mask * attention_mask.new_ones(s_q, s_kv).tril(diagonal=0)
                 else:
                     attention_mask = torch.empty((0,), dtype=dtype, device=device)
             else:
                 outputs["attention_mask_type"] = AttnMaskType.PADDED
+                if memory_size < MEMORY_BOUND:
+                    attention_mask = kv_padding_mask[:, None, :].expand(b, s_q, s_kv).to(dtype=dtype, device=device)
+
         if invert:
             attention_mask = invert_mask(attention_mask).unsqueeze(1)
         outputs["attention_mask"] = attention_mask

From 10e4f7da724d3e45135d4544678793ecd3e74029 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 16 Sep 2024 13:45:04 +0800
Subject: [PATCH 090/167] fix

---
 colossalai/shardformer/layer/attn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 2f8e4d677c54..5f0e9261c0de 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -121,7 +121,8 @@ def _dispatch_kernel(dtype: torch.dtype, mask_type: Optional[AttnMaskType], size
             )
 
         if size >= MEMORY_BOUND:
-            ColoAttention._flash_kernel_dispatch = ColoAttention._flash_kernel_dispatch.load()
+            if isinstance(ColoAttention._flash_kernel_dispatch, KernelLoader):
+                ColoAttention._flash_kernel_dispatch = ColoAttention._flash_kernel_dispatch.load()
         # lazy load
         if isinstance(ColoAttention._kernel_dispatch_map[dtype][mask_type], KernelLoader):
             ColoAttention._kernel_dispatch_map[dtype][mask_type] = ColoAttention._kernel_dispatch_map[dtype][

From 4fa6b9509c7dfe44c4e99188a811255a848f8dbf Mon Sep 17 00:00:00 2001
From: botbw <wang1570@e.ntu.edu.sg>
Date: Wed, 18 Sep 2024 10:09:01 +0800
Subject: [PATCH 091/167] [moe] add parallel strategy for shared_expert && fix
 test for deepseek (#6063)

---
 colossalai/shardformer/modeling/deepseek.py         | 13 +++++++++++++
 .../test_model/test_shard_deepseek.py               | 10 ++++++----
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/colossalai/shardformer/modeling/deepseek.py b/colossalai/shardformer/modeling/deepseek.py
index 4b1b82b7c770..7bcdf6fc9892 100644
--- a/colossalai/shardformer/modeling/deepseek.py
+++ b/colossalai/shardformer/modeling/deepseek.py
@@ -109,6 +109,19 @@ def setup_process_groups(
         for p in self.experts.parameters():
             set_moe_tensor_ep_group(p, ep_group)
 
+        if self.config.n_shared_experts is not None:
+            self.shared_experts.gate_proj = Linear1D_Col.from_native_module(
+                self.shared_experts.gate_proj, self.tp_group, fp8_communication=self.fp8_communication
+            )
+
+            self.shared_experts.up_proj = Linear1D_Col.from_native_module(
+                self.shared_experts.up_proj, self.tp_group, fp8_communication=self.fp8_communication
+            )
+
+            self.shared_experts.down_proj = Linear1D_Row.from_native_module(
+                self.shared_experts.down_proj, self.tp_group, fp8_communication=self.fp8_communication
+            )
+
     @staticmethod
     def from_native_module(
         module,
diff --git a/tests/test_shardformer/test_model/test_shard_deepseek.py b/tests/test_shardformer/test_model/test_shard_deepseek.py
index d782a2a09604..4b92dbdee4bf 100644
--- a/tests/test_shardformer/test_model/test_shard_deepseek.py
+++ b/tests/test_shardformer/test_model/test_shard_deepseek.py
@@ -20,14 +20,15 @@
 NUM_BATCH = 8
 NUM_TOK_PER_BATCH, NUM_EXPERTS = 64, 4
 NUM_LAYERS = 4
-HIDDEN_SIZE_PER_HEAD = 4
+HIDDEN_SIZE_PER_HEAD = 8
 NUM_HEADS = 8
 TOP_K = 2
 
 
-def run_deepseek_commom(config: Tuple[int, ...]):
+def run_deepseek_commom(parallel_config: Tuple[int, ...]):
     Randomizer.reset_index()
-    stage, ep_size, pp_size, tp_size, sp_size = config
+    print(f"rank {dist.get_rank()} testing {parallel_config}")
+    stage, ep_size, pp_size, tp_size, sp_size = parallel_config
     world_size = dist.get_world_size()
     rank = dist.get_rank()
     dtype, precision = torch.bfloat16, "bf16"
@@ -65,6 +66,7 @@ def run_deepseek_commom(config: Tuple[int, ...]):
         attn_implementation="flash_attention_2",
         torch_dtype="float16",
         n_routed_experts=NUM_EXPERTS,
+        n_shared_experts=2,
         num_experts_per_tok=TOP_K,
         trust_remote_code=True,
     )
@@ -159,7 +161,7 @@ def run_deepseek_commom(config: Tuple[int, ...]):
     if rank == world_size - 1:
         shutil.rmtree(model_dir)
 
-    print(f"rank {dist.get_rank()} test passed")
+    print(f"rank {dist.get_rank()} passed {parallel_config}")
 
 
 @parameterize(

From f9546ba0bee31cbc00a5266b2d9efbacad8fafa8 Mon Sep 17 00:00:00 2001
From: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
Date: Wed, 18 Sep 2024 17:09:45 +0800
Subject: [PATCH 092/167] [ColossalEval] support for vllm (#6056)

* support vllm

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* modify vllm and update readme

* run pre-commit

* remove dupilicated lines and refine code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update param name

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refine code

* update readme

* refine code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 applications/ColossalEval/README.md           |  49 +-
 .../colossal_eval/dataset/agieval.py          |   2 +-
 .../colossal_eval/dataset/ceval.py            |   2 +-
 .../colossal_eval/dataset/cmmlu.py            |   2 +-
 .../colossal_eval/dataset/colossalai.py       |   2 +-
 .../colossal_eval/dataset/cvalues.py          |   2 +-
 .../colossal_eval/dataset/gaokaobench.py      |   2 +-
 .../ColossalEval/colossal_eval/dataset/gsm.py |   4 +-
 .../colossal_eval/dataset/longbench.py        |   2 +-
 .../colossal_eval/dataset/mmlu.py             |   2 +-
 .../colossal_eval/dataset/mtbench.py          |   2 +-
 .../colossal_eval/dataset/safetybench_en.py   |   2 +-
 .../colossal_eval/dataset/safetybench_zh.py   |   2 +-
 .../colossal_eval/models/__init__.py          |   3 +-
 .../colossal_eval/models/chatglm.py           |   4 +-
 .../colossal_eval/models/huggingface.py       |  28 +-
 .../ColossalEval/colossal_eval/models/vllm.py | 498 ++++++++++++++++++
 .../examples/dataset_evaluation/inference.py  |   2 +-
 applications/ColossalEval/requirements.txt    |   1 +
 19 files changed, 576 insertions(+), 35 deletions(-)
 create mode 100644 applications/ColossalEval/colossal_eval/models/vllm.py

diff --git a/applications/ColossalEval/README.md b/applications/ColossalEval/README.md
index 890b1fed3912..bc5394a69a44 100644
--- a/applications/ColossalEval/README.md
+++ b/applications/ColossalEval/README.md
@@ -154,7 +154,7 @@ inference_kwargs = {
     "calculate_loss": True,
     "all_classes": ["A", "B", "C", "D"],
     "language": "Chinese",
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 32
 }
 ```
@@ -163,7 +163,7 @@ The `inference_kwargs` currently contains 5 fields:
 - `calculate_loss` (bool, compulsory): Whether the loss on target tokens will be calculated
 - `all_classes` (Optional[list], compulsory): Whether the subcategory is a single-choice question. Specify all available options in a list or otherwise None.
 - `language` (str, compulsory): The language for the subcategory.
-- `pretrain` (bool, compulsory): Whether the dataset is a pretrain dataset or not. It is usually used for calculate perplexity when you want to evaluate a model with extended context length.
+- `calculate_overall_loss` (bool, compulsory): Whether to calculate the overall loss of sentences or not if the dataset is a pretrain dataset. It is usually used for calculate perplexity when you want to evaluate a model with extended context length.
 - `max_new_tokens` (int, compulsory): The number of new tokens to generate during inference.
 
 For example, for dataset MMLU, each subcategory consists of single-choice questions with options A, B, C and D by default and we can assign value `["A", "B", "C", "D"]` to key`all_classes`. For dataset C-Eval, target answers aren't provided in the test split so `calculate_loss` should be set as False. However, other dataset such as GAOKAO-bench contains different formats of questions and lacks some keys or metadata which can reveal what type (single-choice or multi-choice) of questions it is. Before assigning inference arguments, we first parse the dataset to decide which type of questions the subcategory belongs to and set the inference arguments accordingly.
@@ -230,7 +230,7 @@ Example:
 In this step, you will configure your tokenizer and model arguments to infer on the given datasets.
 
 A config file consists of two parts.
-1. Model config. In model config, you need to specify model name, model path, model class, tokenizer arguments and model arguments. For model class, currently we support `HuggingFaceModel`, `HuggingFaceCausalLM`, `ChatGLMModel` and `ChatGLMModel2`. `HuggingFaceModel` is for models that can be loaded with `AutoModel` and `HuggingFaceCausalLM` is for models that can be loaded with `AutoModelForCausalLM`. `ChatGLMModel` and `ChatGLMModel2` are for ChatGLM and ChatGLM2 models respectively. You can check all model classes in `colossal_eval/models/__init__.py`. If your model should set `trust_remote_code` as true, specify it in the `tokenizer_kwargs` and `model_kwargs` fields.
+1. Model config. In model config, you need to specify model name, model path, model class, tokenizer arguments and model arguments. For model class, currently we support `HuggingFaceModel`, `HuggingFaceCausalLM`, `ChatGLMModel`, `ChatGLMModel2` and `vLLMModel`. `HuggingFaceModel` is for models that can be loaded with `AutoModel` and `HuggingFaceCausalLM` is for models that can be loaded with `AutoModelForCausalLM`. `ChatGLMModel` and `ChatGLMModel2` are for ChatGLM and ChatGLM2 models respectively. `vLLMModel` is for models that can be loaded with vllm offline inference `LLM` class. You can check all model classes in `colossal_eval/models/__init__.py`. If your model should set `trust_remote_code` as true, specify it in the `tokenizer_kwargs` and `model_kwargs` fields.
 2. Dataset config. In dataset config, you need to specify dataset name, path and dataset class. Currently, we support zero-shot on dataset MMLU, CMMLU, AGIEval, GAOKAO-Bench, GSM8K and LongBench and few-shot on dataset MMLU, CMMLU AGIEval and GSM8K. If you want to enable few shot, set `few_shot` as true. You can check all model classes in `colossal_eval/dataset/__init__.py`.
 
 Once you have all config ready, the program will run inference on all the given datasets on all the given models.
@@ -272,7 +272,42 @@ An example config using model class `HuggingFaceCausalLM` and dataset class `CMM
 }
 ```
 
-Currently, we support Hugging Face models. The `tokenizer_kwargs` is the arguments used in `AutoTokenizer.from_pretrained()`. The `model_kwargs` is the arguments used in `AutoModel.from_pretrained` or `AutoModelForCausalLM.from_pretrained()`. `few_shot` will be set true if you want to enable few-shot prompting for the dataset. `debug` will be set true if you want to verify whether your prompt is right or wrong.
+An example config using model class `vLLMModel` and dataset class `CMMLUDataset` can be:
+```json
+{
+    "model": [
+        {
+            "name": "model name",
+            "model_class": "vLLMModel",
+            "parameters": {
+                "path": "path to model",
+                "model_max_length": 2048,
+                "tokenizer_path": "",
+                "tokenizer_kwargs": {
+                    "trust_remote_code": true
+                },
+                "model_kwargs": {
+                    "trust_remote_code": true
+                },
+                "prompt_template": "plain",
+                "batch_size": 4
+            }
+        }
+    ],
+    "dataset": [
+        {
+            "name": "dataset name",
+            "dataset_class": "CMMLUDataset",
+            "debug": false,
+            "few_shot": true,
+            "path": "path to original dataset",
+            "save_path": "path to save converted dataset"
+        }
+    ]
+}
+```
+
+Currently, we support Hugging Face models as well as vLLM models. For Hugging Face models, the `tokenizer_kwargs` is the arguments used in `AutoTokenizer.from_pretrained()`. The `model_kwargs` is the arguments used in `AutoModel.from_pretrained` or `AutoModelForCausalLM.from_pretrained()`. For vLLM model, the `tokenizer_kwargs` and `model_kwargs` are loaded together in `LLM` class.`few_shot` will be set true if you want to enable few-shot prompting for the dataset. `debug` will be set true if you want to verify whether your prompt is right or wrong.
 
 > For GSM8K dataset, you can set additional flags `load_train` or `load_reference` for dataset configuration as true and during the inference process, the program will calculate loss summation over all tokens for each data sample. During the evaluation process, you can use metric `loss_over_all_tokens` to calculate the overall loss and use it for data leakage evaluation.
 
@@ -287,7 +322,7 @@ torchrun --nproc_per_node=4 inference.py \
     --inference_save_path "path to save inference results"
 ```
 
-You should specify the path to config file in `config`. You can run the script without specifying `load_dataset` if you already save the converted dataset or otherwise set it to first load the original dataset and save the converted dataset. You should specify the path to save inference results in `inference_save_path`. If you want to use tensor parallel inference, specify the tensor parallel size in `--tp_size` and the process will automatically calculate  data parallel size.
+You should specify the path to config file in `config`. You can run the script without specifying `load_dataset` if you already save the converted dataset or otherwise set it to first load the original dataset and save the converted dataset. You should specify the path to save inference results in `inference_save_path`. If you want to use tensor parallel inference, specify the tensor parallel size in `--tp_size` and the process will automatically calculate data parallel size (currently not support for `vLLMModel`).
 
 ### Evaluation
 
@@ -530,10 +565,6 @@ class CustomizedModel(BaseModel):
 
 Once you have successfully added your own model, you can specify your model class in your inference config.
 
-## To do
-
-- [ ] Add visualization code for evaluation results on public dataset
-- [ ] Improve the way to label target tokens
 
 ## Citations
 
diff --git a/applications/ColossalEval/colossal_eval/dataset/agieval.py b/applications/ColossalEval/colossal_eval/dataset/agieval.py
index c1cfe37d7599..07597048d7f9 100644
--- a/applications/ColossalEval/colossal_eval/dataset/agieval.py
+++ b/applications/ColossalEval/colossal_eval/dataset/agieval.py
@@ -47,7 +47,7 @@
     "calculate_loss": True,
     "all_classes": None,
     "language": "Chinese",
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 32,
 }
 
diff --git a/applications/ColossalEval/colossal_eval/dataset/ceval.py b/applications/ColossalEval/colossal_eval/dataset/ceval.py
index 1023d1e23c1f..b15dd93afc87 100644
--- a/applications/ColossalEval/colossal_eval/dataset/ceval.py
+++ b/applications/ColossalEval/colossal_eval/dataset/ceval.py
@@ -70,7 +70,7 @@
     "calculate_loss": False,
     "all_classes": ["A", "B", "C", "D"],
     "language": "Chinese",
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 32,
 }
 
diff --git a/applications/ColossalEval/colossal_eval/dataset/cmmlu.py b/applications/ColossalEval/colossal_eval/dataset/cmmlu.py
index 05752c2486fa..402a2d4c8eab 100644
--- a/applications/ColossalEval/colossal_eval/dataset/cmmlu.py
+++ b/applications/ColossalEval/colossal_eval/dataset/cmmlu.py
@@ -81,7 +81,7 @@
     "calculate_loss": True,
     "all_classes": ["A", "B", "C", "D"],
     "language": "Chinese",
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 32,
 }
 
diff --git a/applications/ColossalEval/colossal_eval/dataset/colossalai.py b/applications/ColossalEval/colossal_eval/dataset/colossalai.py
index 0337454fa788..266eaef3f486 100644
--- a/applications/ColossalEval/colossal_eval/dataset/colossalai.py
+++ b/applications/ColossalEval/colossal_eval/dataset/colossalai.py
@@ -12,7 +12,7 @@
     "calculate_loss": False,
     "all_classes": None,
     "language": "Chinese",
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 256,
 }
 
diff --git a/applications/ColossalEval/colossal_eval/dataset/cvalues.py b/applications/ColossalEval/colossal_eval/dataset/cvalues.py
index 4023a4c76322..f5b81f90ed3f 100644
--- a/applications/ColossalEval/colossal_eval/dataset/cvalues.py
+++ b/applications/ColossalEval/colossal_eval/dataset/cvalues.py
@@ -15,7 +15,7 @@
     "calculate_loss": False,
     "all_classes": ["A", "B"],
     "language": LANGUAGE,
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 32,
 }
 
diff --git a/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py b/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py
index 44ccea9cfa2c..533e9b4bfa52 100644
--- a/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py
+++ b/applications/ColossalEval/colossal_eval/dataset/gaokaobench.py
@@ -36,7 +36,7 @@
     "calculate_loss": True,
     "all_classes": None,
     "language": "Chinese",
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 32,
 }
 
diff --git a/applications/ColossalEval/colossal_eval/dataset/gsm.py b/applications/ColossalEval/colossal_eval/dataset/gsm.py
index 775c5843ff79..a639201053ef 100644
--- a/applications/ColossalEval/colossal_eval/dataset/gsm.py
+++ b/applications/ColossalEval/colossal_eval/dataset/gsm.py
@@ -72,7 +72,7 @@
     "calculate_loss": True,
     "all_classes": None,
     "language": "English",
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 256,
 }
 
@@ -114,7 +114,7 @@ def load(
             dataset[split][subject]["inference_kwargs"] = copy.deepcopy(default_inference_kwargs)
 
             if forward_only:
-                dataset[split][subject]["inference_kwargs"]["pretrain"] = True
+                dataset[split][subject]["inference_kwargs"]["calculate_overall_loss"] = True
 
             if split == "test" and few_shot:
                 dataset[split][subject]["inference_kwargs"]["few_shot_data"] = get_few_shot_data()
diff --git a/applications/ColossalEval/colossal_eval/dataset/longbench.py b/applications/ColossalEval/colossal_eval/dataset/longbench.py
index eb61efaa0d7c..e663e5e108e6 100644
--- a/applications/ColossalEval/colossal_eval/dataset/longbench.py
+++ b/applications/ColossalEval/colossal_eval/dataset/longbench.py
@@ -60,7 +60,7 @@
     "calculate_loss": True,
     "all_classes": None,
     "language": "Chinese",
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 32,
 }
 
diff --git a/applications/ColossalEval/colossal_eval/dataset/mmlu.py b/applications/ColossalEval/colossal_eval/dataset/mmlu.py
index e9465c91b3ce..5e3ff6af6ef3 100644
--- a/applications/ColossalEval/colossal_eval/dataset/mmlu.py
+++ b/applications/ColossalEval/colossal_eval/dataset/mmlu.py
@@ -11,7 +11,7 @@
     "calculate_loss": True,
     "all_classes": ["A", "B", "C", "D"],
     "language": "English",
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 32,
 }
 
diff --git a/applications/ColossalEval/colossal_eval/dataset/mtbench.py b/applications/ColossalEval/colossal_eval/dataset/mtbench.py
index ef474ec4ca23..abec8ebfb038 100644
--- a/applications/ColossalEval/colossal_eval/dataset/mtbench.py
+++ b/applications/ColossalEval/colossal_eval/dataset/mtbench.py
@@ -14,7 +14,7 @@
     "calculate_loss": False,
     "all_classes": None,
     "language": "English",
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 1024,
     "turns": 2,
 }
diff --git a/applications/ColossalEval/colossal_eval/dataset/safetybench_en.py b/applications/ColossalEval/colossal_eval/dataset/safetybench_en.py
index 8056c3dfd8bf..494bb0993ccf 100644
--- a/applications/ColossalEval/colossal_eval/dataset/safetybench_en.py
+++ b/applications/ColossalEval/colossal_eval/dataset/safetybench_en.py
@@ -28,7 +28,7 @@
     "calculate_loss": False,
     "all_classes": ["A", "B", "C", "D"],
     "language": LANGUAGE,
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 32,
 }
 
diff --git a/applications/ColossalEval/colossal_eval/dataset/safetybench_zh.py b/applications/ColossalEval/colossal_eval/dataset/safetybench_zh.py
index f5f17e64c991..8c41664c02c8 100644
--- a/applications/ColossalEval/colossal_eval/dataset/safetybench_zh.py
+++ b/applications/ColossalEval/colossal_eval/dataset/safetybench_zh.py
@@ -28,7 +28,7 @@
     "calculate_loss": False,
     "all_classes": ["A", "B", "C", "D"],
     "language": LANGUAGE,
-    "pretrain": False,
+    "calculate_overall_loss": False,
     "max_new_tokens": 32,
 }
 
diff --git a/applications/ColossalEval/colossal_eval/models/__init__.py b/applications/ColossalEval/colossal_eval/models/__init__.py
index 8f6c9b414145..ec557571ca07 100644
--- a/applications/ColossalEval/colossal_eval/models/__init__.py
+++ b/applications/ColossalEval/colossal_eval/models/__init__.py
@@ -1,5 +1,6 @@
 from .base import BaseModel
 from .chatglm import ChatGLM2Model, ChatGLMModel
 from .huggingface import HuggingFaceCausalLM, HuggingFaceModel
+from .vllm import vLLMModel
 
-__all__ = ["BaseModel", "HuggingFaceModel", "HuggingFaceCausalLM", "ChatGLMModel", "ChatGLM2Model"]
+__all__ = ["BaseModel", "HuggingFaceModel", "HuggingFaceCausalLM", "ChatGLMModel", "ChatGLM2Model", "vLLMModel"]
diff --git a/applications/ColossalEval/colossal_eval/models/chatglm.py b/applications/ColossalEval/colossal_eval/models/chatglm.py
index 9c70c0d2a1ad..4a48f4c0ed3e 100644
--- a/applications/ColossalEval/colossal_eval/models/chatglm.py
+++ b/applications/ColossalEval/colossal_eval/models/chatglm.py
@@ -28,7 +28,7 @@ def _get_truncated_prompts(self, inputs: List[str], max_new_tokens: int) -> List
 
     @torch.no_grad()
     def get_loss(
-        self, batch_prompt: List[str], batch_target: List[List[str]], pretrain: bool = False
+        self, batch_prompt: List[str], batch_target: List[List[str]], calculate_overall_loss: bool = False
     ) -> List[List[float]]:
         """
         Calculate loss only on target tokens.
@@ -225,7 +225,7 @@ def generate(self, inputs: List[str], max_new_tokens: int, **kwargs) -> List[str
 
     @torch.no_grad()
     def get_loss(
-        self, batch_prompt: List[str], batch_target: List[List[str]], pretrain: bool = False
+        self, batch_prompt: List[str], batch_target: List[List[str]], calculate_overall_loss: bool = False
     ) -> List[List[float]]:
         """
         Calculate loss only on target tokens.
diff --git a/applications/ColossalEval/colossal_eval/models/huggingface.py b/applications/ColossalEval/colossal_eval/models/huggingface.py
index e91743525f0e..200e282e7b2b 100644
--- a/applications/ColossalEval/colossal_eval/models/huggingface.py
+++ b/applications/ColossalEval/colossal_eval/models/huggingface.py
@@ -105,6 +105,12 @@ def _load_tokenizer(self, path: str, tokenizer_path: Optional[str], tokenizer_kw
             elif hasattr(self.tokenizer, "eod_id"):
                 # Qwen has an eod token "<|endoftext|>".
                 self.tokenizer.pad_token_id = self.tokenizer.eod_id
+            else:
+                self.logger.error("Neither eos_token nor eod_id is available for setting pad_token_id.")
+                raise ValueError(
+                    "The tokenizer does not have a pad_token_id, eos_token, or eod_id. "
+                    "Please set pad_token_id manually."
+                )
 
     def _load_model(
         self, path: str, model_kwargs: dict, peft_path: Optional[str] = None, shard_config: ShardConfig = None
@@ -245,7 +251,7 @@ def _get_input_ids_and_labels_pretrain(self, batch_prompt: List[str]) -> Tuple[L
         return input_ids_list, labels_list, bytes_list
 
     def _get_input_ids_and_labels(
-        self, batch_prompt: List[str], batch_target: List[List[str]], pretrain: bool
+        self, batch_prompt: List[str], batch_target: List[List[str]], calculate_overall_loss: bool
     ) -> Tuple[List[torch.LongTensor]]:
         """
         Get input_ids and labels for the given data.
@@ -258,7 +264,7 @@ def _get_input_ids_and_labels(
             Input_ids and labels for the given batch.
 
         """
-        if pretrain:
+        if calculate_overall_loss:
             batch = []
             # Concatenate prompt and target answers.
             # You should decide the concatenation character in the corresponding dataset script in dataset folder. For example, in line 119 dataset/gsm.py, the concatenation character is space.
@@ -342,7 +348,7 @@ def inference(self, data_loader: DataLoader, inference_kwargs: Dict[str, Any], d
         calculate_loss = inference_kwargs["calculate_loss"]
         classes = inference_kwargs["all_classes"]
         language = inference_kwargs["language"]
-        pretrain = inference_kwargs["pretrain"]
+        calculate_overall_loss = inference_kwargs["calculate_overall_loss"]
         max_new_tokens = inference_kwargs["max_new_tokens"]
         few_shot_data = inference_kwargs.get("few_shot_data", None)
 
@@ -384,12 +390,12 @@ def inference(self, data_loader: DataLoader, inference_kwargs: Dict[str, Any], d
                 self.logger.info("-" * 120)
                 self.logger.info(batch_prompt[0] + batch_target[0][0])
 
-            if not pretrain:
+            if not calculate_overall_loss:
                 batch_decodes, scores = self.generate(batch_prompt, max_new_tokens)
 
             if calculate_loss:
                 batch_losses, batch_target_token_nums, batch_bytes_nums = self.get_loss(
-                    batch_prompt, batch_target, pretrain
+                    batch_prompt, batch_target, calculate_overall_loss
                 )
 
             probs = []
@@ -409,7 +415,7 @@ def inference(self, data_loader: DataLoader, inference_kwargs: Dict[str, Any], d
                 ]
 
             for j in range(len(batch)):
-                if not pretrain:
+                if not calculate_overall_loss:
                     if isinstance(batch[j]["output"], list):
                         batch[j]["output"].append(batch_decodes[j].strip())
                     else:
@@ -496,7 +502,9 @@ def generate(self, inputs: List[str], max_new_tokens: int, **kwargs) -> List[str
         return decoded_sequences, scores
 
     @torch.no_grad()
-    def get_loss(self, batch_prompt: List[str], batch_target: List[List[str]], pretrain: bool) -> List[List[float]]:
+    def get_loss(
+        self, batch_prompt: List[str], batch_target: List[List[str]], calculate_overall_loss: bool
+    ) -> List[List[float]]:
         """
         Calculate loss only on target tokens.
 
@@ -513,13 +521,15 @@ def get_loss(self, batch_prompt: List[str], batch_target: List[List[str]], pretr
         # We don't need to generate new tokens.
         # Target answer's length is usually << model_max_length, but we still call it in case.
         # We don't call self._get_truncated_prompts for batch_prompt because we need target answer's length first to reserve some space for target answer's tokens.
-        if not pretrain:
+        if not calculate_overall_loss:
             batch_target = [self._get_truncated_prompts(prompt_target, 0) for prompt_target in batch_target]
 
         # Get the number of target answers for different questions
         batch_target_nums = [len(prompt_target) for prompt_target in batch_target]
 
-        input_ids_list, labels_list, bytes_list = self._get_input_ids_and_labels(batch_prompt, batch_target, pretrain)
+        input_ids_list, labels_list, bytes_list = self._get_input_ids_and_labels(
+            batch_prompt, batch_target, calculate_overall_loss
+        )
 
         # Because of multiple target answers, the final batch size may be greater than self.batch_size.
         # We will generate new batches.
diff --git a/applications/ColossalEval/colossal_eval/models/vllm.py b/applications/ColossalEval/colossal_eval/models/vllm.py
new file mode 100644
index 000000000000..2cbdb6e1b767
--- /dev/null
+++ b/applications/ColossalEval/colossal_eval/models/vllm.py
@@ -0,0 +1,498 @@
+import copy
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+from colossal_eval.utils import Conversation, get_batch_prompt, is_rank_0
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from vllm import LLM, SamplingParams
+
+from colossalai.logging import DistributedLogger
+
+from .huggingface import HuggingFaceModel
+
+IGNORE_INDEX = -100
+
+
+class vLLMModel(HuggingFaceModel):
+    """
+    Model wrapper around vLLM models.
+
+    Args:
+        path: The path to a vLLM model.
+        model_max_length: The maximum sequence length of the model.
+        tokenizer_path: The path to the tokenizer.
+        tokenizer_kwargs: Keyword arguments for the tokenizer.
+        model_kwargs: Keyword arguments for the model.
+        prompt_template: The model's prompt template.
+        batch_size: Batch size for inference.
+        logger: Logger for the model.
+        trust_remote_code: Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.
+        tensor_parallel_size: The number of GPUs to use for distributed execution with tensor parallelism.
+        quantization: The method used to quantize the model weights
+        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache.
+        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
+        cpu_offload_gb: The size (GiB) of CPU memory to use for offloading the model weights.
+        enforce_eager: Whether to enforce eager execution.
+        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
+        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
+        disable_custom_all_reduce: See ParallelConfig
+    """
+
+    def __init__(
+        self,
+        path: str,
+        model_max_length: int = 2048,
+        tokenizer_path: Optional[str] = None,
+        tokenizer_kwargs: Dict = None,
+        model_kwargs: Dict = None,
+        prompt_template: Conversation = None,
+        batch_size: int = 1,
+        logger: DistributedLogger = None,
+        trust_remote_code: bool = False,
+        tensor_parallel_size: int = 1,
+        quantization: Optional[str] = None,
+        gpu_memory_utilization: float = 0.5,
+        swap_space: float = 4,
+        cpu_offload_gb: float = 0,
+        enforce_eager: Optional[bool] = None,
+        max_context_len_to_capture: Optional[int] = None,
+        max_seq_len_to_capture: int = 8192,
+        disable_custom_all_reduce: bool = False,
+        **kwargs,
+    ):
+        super().__init__(
+            path=path,
+            model_max_length=model_max_length,
+            prompt_template=prompt_template,
+            batch_size=batch_size,
+            logger=logger,
+        )
+
+        self._load_model(
+            path=path,
+            model_kwargs=model_kwargs,
+            tokenizer_kwargs=tokenizer_kwargs,
+            tokenizer_path=tokenizer_path if tokenizer_path else None,
+            trust_remote_code=trust_remote_code,
+            tensor_parallel_size=tensor_parallel_size,
+            quantization=quantization,
+            gpu_memory_utilization=gpu_memory_utilization,
+            swap_space=swap_space,
+            cpu_offload_gb=cpu_offload_gb,
+            enforce_eager=enforce_eager,
+            max_context_len_to_capture=max_context_len_to_capture,
+            max_seq_len_to_capture=max_seq_len_to_capture,
+            disable_custom_all_reduce=disable_custom_all_reduce,
+        )
+
+    def _load_model(
+        self,
+        path: str,
+        model_kwargs: dict,
+        tokenizer_kwargs: dict,
+        tokenizer_path: Optional[str] = None,
+        trust_remote_code: bool = False,
+        tensor_parallel_size: int = 1,
+        quantization: Optional[str] = None,
+        gpu_memory_utilization: float = 0.9,
+        swap_space: float = 4,
+        cpu_offload_gb: float = 0,
+        enforce_eager: Optional[bool] = None,
+        max_context_len_to_capture: Optional[int] = None,
+        max_seq_len_to_capture: int = 8192,
+        disable_custom_all_reduce: bool = False,
+    ):
+        """
+        Load model.
+
+        Args:
+            path: The path to the model.
+            model_kwargs: Keyword arguments for the model.
+            tokenizer_kwargs: Keyword arguments for the tokenizer.
+            tokenizer_path: The path to the tokenizer.
+            trust_remote_code: Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.
+            tensor_parallel_size: The number of GPUs to use for distributed execution with tensor parallelism.
+            quantization: The method used to quantize the model weights
+            gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache.
+            swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
+            cpu_offload_gb: The size (GiB) of CPU memory to use for offloading the model weights.
+            enforce_eager: Whether to enforce eager execution.
+            max_context_len_to_capture: Maximum context len covered by CUDA graphs.
+            max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
+            disable_custom_all_reduce: See ParallelConfig
+
+        """
+        if "torch_dtype" in model_kwargs:
+            model_kwargs["dtype"] = eval(model_kwargs["torch_dtype"])
+            model_kwargs.pop("torch_dtype")
+        else:
+            model_kwargs.setdefault("dtype", torch.float16)
+
+        if "trust_remote_code" in model_kwargs:
+            trust_remote_code = model_kwargs["trust_remote_code"]
+            model_kwargs.pop("trust_remote_code")
+
+        if "trust_remote_code" in tokenizer_kwargs:
+            trust_remote_code = tokenizer_kwargs["trust_remote_code"]
+            tokenizer_kwargs.pop("trust_remote_code")
+
+        self.model = LLM(
+            model=path,
+            trust_remote_code=trust_remote_code,
+            tensor_parallel_size=tensor_parallel_size,
+            quantization=quantization,
+            gpu_memory_utilization=gpu_memory_utilization,
+            swap_space=swap_space,
+            cpu_offload_gb=cpu_offload_gb,
+            enforce_eager=enforce_eager,
+            max_context_len_to_capture=max_context_len_to_capture,
+            max_seq_len_to_capture=max_seq_len_to_capture,
+            disable_custom_all_reduce=disable_custom_all_reduce,
+            **model_kwargs,
+            **tokenizer_kwargs,
+        )
+
+        self.tokenizer = self.model.get_tokenizer()
+
+        if self.batch_size > 1:
+            self.tokenizer.padding_side = "left"
+            self.tokenizer.truncation_side = "left"
+
+        if self.tokenizer.pad_token_id is None:
+            self.logger.warning("pad_token_id is not set for the tokenizer. " "Using eos_token_id as pad_token_id.")
+            if self.tokenizer.eos_token:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+            elif hasattr(self.tokenizer, "eod_id"):
+                # Qwen has an eod token "<|endoftext|>".
+                self.tokenizer.pad_token_id = self.tokenizer.eod_id
+            else:
+                self.logger.error("Neither eos_token nor eod_id is available for setting pad_token_id.")
+                raise ValueError(
+                    "The tokenizer does not have a pad_token_id, eos_token, or eod_id. "
+                    "Please set pad_token_id manually."
+                )
+
+    def _calculate_loss(self, inputs: List[str], labels: List[str]) -> Tuple[List]:
+        """
+        Calculate loss on target tokens. Adapted from https://github.com/open-compass/opencompass/blob/c2bcd8725e615ec455bf5b7301f8d09962cd64e3/opencompass/models/vllm.py#L110
+
+        Args:
+            input_ids_list: A batch of input string.
+            labels: A batch of labels.
+
+        Returns:
+            A list of loss and a list of label length.
+
+        """
+        batch_size = len(inputs)
+        sampling_kwargs = SamplingParams(logprobs=1)
+        outputs = self.model.generate(inputs, sampling_kwargs)
+        ce_loss = []
+
+        if labels is not None:
+            lens = [len(self.tokenizer.encode(label, add_special_tokens=False)) for label in labels]
+        else:
+            lens = [1] * batch_size
+
+        for i in range(batch_size):
+            logprobs = outputs[i].outputs[0].logprobs
+            token_ids = outputs[i].outputs[0].token_ids
+
+            logprobs_list = [logprobs[i][token_ids[i]] for i in range(len(logprobs))]
+            logprobs_list = [i.logprob for i in logprobs_list]
+            logprobs_list = np.array(logprobs_list)
+
+            if lens is not None:
+                logprobs_list = logprobs_list[: lens[i]]
+
+            loss = -logprobs_list.sum(axis=-1) / lens[i]
+            ce_loss.append(loss)
+
+        batch_loss = np.array(ce_loss)
+
+        return batch_loss, lens
+
+    def inference(self, data_loader: DataLoader, inference_kwargs: Dict[str, Any], debug: bool = False) -> List[Dict]:
+        """
+        Infer the given data.
+        This function will call self.generate() to get model outputs and use LogitsProcessor param to get specific logits.
+
+        Args:
+            data: The data for inference.
+            inference_kwargs: Arguments for inference.
+            debug: Whether to display generated prompt for debugging.
+
+        Returns:
+            Inference results.
+
+        """
+        calculate_loss = inference_kwargs["calculate_loss"]
+        classes = inference_kwargs["all_classes"]
+        language = inference_kwargs["language"]
+        calculate_overall_loss = inference_kwargs["calculate_overall_loss"]
+        max_new_tokens = inference_kwargs["max_new_tokens"]
+        few_shot_data = inference_kwargs.get("few_shot_data", None)
+
+        # Some classification questions' options are texts not a single letter such as A, B, C and D.
+        # If the text length is greater than 1, we won't calculate loss over choices.
+        if classes is not None and any(len(c) > 1 for c in classes):
+            classes = None
+
+        self.choices = classes
+        self.indices_for_choices = None
+        if self.choices:
+            # Get indices for each choice
+            self._get_choices_indices(language)
+
+            self.str_label_map = {choice: idx for idx, choice in enumerate(self.choices)}
+
+        bar = tqdm(
+            range(len(data_loader)),
+            desc=f"{inference_kwargs['dataset']}-{inference_kwargs['category']} Inference steps",
+            disable=not is_rank_0(),
+        )
+        loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
+
+        answers = []
+
+        for i, batch in enumerate(data_loader):
+            batch_prompt, batch_target = get_batch_prompt(
+                self.prompt_template, batch, few_shot_data, self.tokenizer, self.model_max_length
+            )
+
+            if is_rank_0() and debug and i == 0:
+                self.logger.info(
+                    f"Inference arguments for dataset {batch[0]['dataset']} category {batch[0]['category']} is:\n{inference_kwargs}"
+                )
+                self.logger.info("-" * 120)
+                self.logger.info("An example prompt and prompt with target is:")
+                self.logger.info("-" * 120)
+                self.logger.info(batch_prompt[0])
+                self.logger.info("-" * 120)
+                self.logger.info(batch_prompt[0] + batch_target[0][0])
+
+            if not calculate_overall_loss:
+                batch_decodes, scores = self.generate(batch_prompt, max_new_tokens)
+
+            if calculate_loss:
+                batch_losses, batch_target_token_nums, batch_bytes_nums = self.get_loss(
+                    batch_prompt, batch_target, calculate_overall_loss
+                )
+
+            probs = []
+            if self.indices_for_choices:
+                scores = scores.to(torch.float32)
+                # If we have indices_for_choices(must be single-choice question), there will be only one target answer for one data sample.
+                # Otherwise this will violate the single-choice setting.
+
+                if calculate_loss:
+                    labels = [self.str_label_map[batch[j]["target"]] for j in range(len(batch))]
+
+                    loss_over_choices = loss_fct(scores, torch.tensor(labels, dtype=torch.long)).numpy().tolist()
+
+                probs = scores.numpy().tolist()
+                probs = [
+                    {choice: probs[i][self.str_label_map[choice]] for choice in self.choices} for i in range(len(probs))
+                ]
+
+            for j in range(len(batch)):
+                if not calculate_overall_loss:
+                    if isinstance(batch[j]["output"], list):
+                        batch[j]["output"].append(batch_decodes[j].strip())
+                    else:
+                        batch[j]["output"] = batch_decodes[j].strip()
+
+                    if isinstance(scores, torch.Tensor):
+                        batch[j]["logits_over_choices"] = probs[j]
+
+                        if calculate_loss:
+                            batch[j]["loss_over_choices"] = loss_over_choices[j]
+
+                if calculate_loss:
+                    batch[j]["loss"] = (np.array(batch_losses[j]) / np.array(batch_target_token_nums[j])).tolist()
+
+                    # loss_sum is specially used for pertrain dataset for calculating per-byte-perplexity.
+                    # However, loss (which is per sample loss) suffices for most cases.
+                    batch[j]["loss_sum"] = batch_losses[j]
+                    batch[j]["token_num"] = batch_target_token_nums[j]
+
+                    if batch_bytes_nums:
+                        batch[j]["byte_num"] = batch_bytes_nums[j]
+            answers.extend(batch)
+
+            bar.update()
+
+        return answers
+
+    @torch.no_grad()
+    def generate(self, inputs: List[str], max_new_tokens: int, **kwargs) -> List[str]:
+        """Generate results given a list of inputs and get logits of the first new token over choices.
+
+        Args:
+            inputs: A list of strings.
+            max_new_tokens: Max new tokens for generation.
+            kwargs: Key arguments for generation
+
+        Returns:
+            A list of generated strings and logits over choices.
+
+        Note:
+            Currently the function only returns the logits of the first new token.
+            It is used for single choice question.
+            For multiple choices question, please avoid using the loss over choices.
+            You should set argument choices as None in self.inference().
+
+        """
+        truncated_inputs = self._get_truncated_prompts(inputs, max_new_tokens)
+
+        generation_kwargs = kwargs.copy()
+        generation_kwargs.update({"max_tokens": max_new_tokens})
+        logits_processor = GetTokenLogitsProcessor(self.indices_for_choices)
+
+        sampling_kwargs = SamplingParams(logits_processors=[logits_processor], **generation_kwargs)
+
+        outputs = self.model.generate(truncated_inputs, sampling_kwargs)
+        output_strs = []
+        for output in outputs:
+            generated_text = output.outputs[0].text
+            output_strs.append(generated_text)
+        scores = logits_processor.get_target_logits()
+
+        return output_strs, scores
+
+    @torch.no_grad()
+    def get_loss(
+        self, batch_prompt: List[str], batch_target: List[List[str]], calculate_overall_loss: bool
+    ) -> List[List[float]]:
+        """
+        Calculate loss only on target tokens.
+
+        Args:
+            batch: A batch of prompt without target answer.
+            batch_target: A batch of target answer. Sometimes one question can have multiple target answers.
+
+        Returns:
+            Loss.
+
+        """
+
+        # We set max_new_tokens in self._get_truncated_prompts to 0 because we only need logits to calculate loss.
+        # We don't need to generate new tokens.
+        # Target answer's length is usually << model_max_length, but we still call it in case.
+        # We don't call self._get_truncated_prompts for batch_prompt because we need target answer's length first to reserve some space for target answer's tokens.
+        if not calculate_overall_loss:
+            batch_target = [self._get_truncated_prompts(prompt_target, 0) for prompt_target in batch_target]
+
+        # Get the number of target answers for different questions
+        batch_target_nums = [len(prompt_target) for prompt_target in batch_target]
+
+        if calculate_overall_loss:
+            batch = []
+            bytes_list = []
+            batch_prompt_pretrain = []
+            for p, b in zip(batch_prompt, batch_target):
+                batch.append(p + b[0])
+
+            for input in batch:
+                # Pretrain data tends to be very long, sometimes much larger than the model_max_length, we only tokenize 1/ratio of the data first to accelerate the tokenization process.
+                # Once the length of the result is greater or equal to model_max_length, we stop iterating on ratios and use the result as input_ids and labels.
+                # After all, the rest of the original string doesn't need to be tokenized at the first place.
+                # Pretrain data tends to be very long, sometimes much larger than the model_max_length, we only tokenize 1/ratio of the data first to accelerate the tokenization process.
+                # Once the length of the result is greater or equal to model_max_length, we stop iterating on ratios and use the result as input_ids and labels.
+                # After all, the rest of the original string doesn't need to be tokenized at the first place.
+                ratio = [16, 8, 4, 2, 1]
+                tokenized = None
+                for r in ratio:
+                    tokenized = self.tokenizer(
+                        [input[0 : len(input) // r]],
+                        truncation=True,
+                        max_length=self.model_max_length,
+                        return_tensors="pt",
+                    )
+                    if tokenized.input_ids.size(1) >= self.model_max_length:
+                        break
+
+                string = self.tokenizer.decode(tokenized.input_ids[0], skip_special_tokens=True)
+                batch_prompt_pretrain.append(string)
+                bytes_list.append(len(string.encode("utf-8")))
+
+            batch_prompt = copy.deepcopy(batch_prompt_pretrain)
+            batch_target = None
+        else:
+            batch_prompt_processed = []
+            batch_target_processed = []
+            for prompt, targets in zip(batch_prompt, batch_target):
+                for target in targets:
+                    target_tokenized = self.tokenizer(
+                        [target], truncation=True, max_length=self.model_max_length, return_tensors="pt"
+                    )
+                    max_new_tokens = target_tokenized["input_ids"][0].size(0)
+                    prompt_with_correct_length = self._get_truncated_prompts([prompt], max_new_tokens)[0]
+                    batch_prompt_processed.append(prompt_with_correct_length)
+                    batch_target_processed.append(target)
+
+            batch_prompt = copy.deepcopy(batch_prompt_processed)
+            batch_target = copy.deepcopy(batch_target_processed)
+            bytes_list = None
+
+        # Because of multiple target answers, the final batch size may be greater than self.batch_size.
+        # We will generate new batches.
+        losses = []
+        target_token_nums = []
+
+        losses_per_batch, target_token_num_per_batch = self._calculate_loss(batch_prompt, batch_target)
+        losses.extend(losses_per_batch)
+        target_token_nums.extend(target_token_num_per_batch)
+
+        start_indice = 0
+        losses_per_sample = []
+
+        target_token_nums_per_sample = []
+        bytes_nums_per_sample = []
+        for length in batch_target_nums:
+            losses_per_sample.append(losses[start_indice : start_indice + length])
+            target_token_nums_per_sample.append(target_token_nums[start_indice : start_indice + length])
+
+            if bytes_list:
+                bytes_nums_per_sample.append(bytes_list[start_indice : start_indice + length])
+
+            start_indice += length
+
+        if bytes_list:
+            return losses_per_sample, target_token_nums_per_sample, bytes_nums_per_sample
+
+        return losses_per_sample, target_token_nums_per_sample, None
+
+
+class GetTokenLogitsProcessor:
+    """
+    LogitsProcessor to get specific logits
+
+    Args:
+        indices_for_choices: token indices of required tokens
+        target_logits: store all the target logits
+    """
+
+    def __init__(
+        self,
+        indices_for_choices: List[List[int]],
+    ):
+        self.indices_for_choices = (indices_for_choices,)
+        self.target_logits = []
+
+    def __call__(self, input_ids: torch.Tensor, logits: torch.Tensor) -> torch.Tensor:
+        choice_scores = []
+
+        if not input_ids:
+            for option_indices in self.indices_for_choices[0]:
+                choice_scores.append(logits[option_indices].detach().cpu())
+
+            choice_scores = torch.max(torch.stack(choice_scores), dim=0)[0]
+            self.target_logits.append(choice_scores)
+
+        return logits
+
+    def get_target_logits(self) -> torch.Tensor:
+        return torch.stack(self.target_logits) if self.target_logits else torch.tensor([])
diff --git a/applications/ColossalEval/examples/dataset_evaluation/inference.py b/applications/ColossalEval/examples/dataset_evaluation/inference.py
index c651970ee37c..1d3f13745474 100644
--- a/applications/ColossalEval/examples/dataset_evaluation/inference.py
+++ b/applications/ColossalEval/examples/dataset_evaluation/inference.py
@@ -69,7 +69,7 @@ def rm_and_merge(
                         os.remove(directory)
                     except Exception as e:
                         print(e)
-                print(len(answers["data"]))
+
                 all_answers[category] = answers
 
             all_answers_with_dataset_class["inference_results"] = all_answers
diff --git a/applications/ColossalEval/requirements.txt b/applications/ColossalEval/requirements.txt
index c5b9bad549e2..f9985b49f9ed 100644
--- a/applications/ColossalEval/requirements.txt
+++ b/applications/ColossalEval/requirements.txt
@@ -10,3 +10,4 @@ matplotlib
 pandas
 seaborn
 scikit-learn
+vllm==0.5.5

From dabc2e7430bb5d6bad1fed7629b534c10ae8c609 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Thu, 19 Sep 2024 10:45:32 +0800
Subject: [PATCH 093/167] [release] update version (#6062)

---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index 17b2ccd9bf90..6f2743d65dc0 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.4.3
+0.4.4

From cbaa1042162d71b4fa6224270e4c8e1c2f77803d Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Wed, 25 Sep 2024 11:57:16 +0800
Subject: [PATCH 094/167] release FP8 news (#6068)

* add FP8 news

* release FP8 news

* release FP8 news
---
 README.md              | 1 +
 docs/README-zh-Hans.md | 1 +
 2 files changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 22c565b5058d..d67d8ed35527 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@
 </div>
 
 ## Latest News
+* [2024/09] [Reducing AI Large Model Training Costs by 30% Requires Just a Single Line of Code From FP8 Mixed Precision Training Upgrades](https://company.hpc-ai.com/blog/reducing-ai-large-model-training-costs-by-30-requires-just-a-single-line-of-code-from-fp8-mixed-precision-training-upgrades)
 * [2024/06] [Open-Sora Continues Open Source: Generate Any 16-Second 720p HD Video with One Click, Model Weights Ready to Use](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use)
 * [2024/05] [Large AI Models Inference Speed Doubled, Colossal-Inference Open Source Release](https://hpc-ai.com/blog/colossal-inference)
 * [2024/04] [Open-Sora Unveils Major Upgrade: Embracing Open Source with Single-Shot 16-Second Video Generation and 720p Resolution](https://hpc-ai.com/blog/open-soras-comprehensive-upgrade-unveiled-embracing-16-second-video-generation-and-720p-resolution-in-open-source)
diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md
index ef121d348a1a..11e07375c30e 100644
--- a/docs/README-zh-Hans.md
+++ b/docs/README-zh-Hans.md
@@ -25,6 +25,7 @@
 </div>
 
 ## 新闻
+* [2024/09] [Reducing AI Large Model Training Costs by 30% Requires Just a Single Line of Code From FP8 Mixed Precision Training Upgrades](https://company.hpc-ai.com/blog/reducing-ai-large-model-training-costs-by-30-requires-just-a-single-line-of-code-from-fp8-mixed-precision-training-upgrades)
 * [2024/06] [Open-Sora Continues Open Source: Generate Any 16-Second 720p HD Video with One Click, Model Weights Ready to Use](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use)
 * [2024/05] [Large AI Models Inference Speed Doubled, Colossal-Inference Open Source Release](https://hpc-ai.com/blog/colossal-inference)
 * [2024/04] [Open-Sora Unveils Major Upgrade: Embracing Open Source with Single-Shot 16-Second Video Generation and 720p Resolution](https://hpc-ai.com/blog/open-soras-comprehensive-upgrade-unveiled-embracing-16-second-video-generation-and-720p-resolution-in-open-source)

From cfd9eda628cb9a3e7c6ecb15daeca5b93741fa12 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 25 Sep 2024 18:34:29 +0800
Subject: [PATCH 095/167] fix the ring attn

---
 colossalai/shardformer/layer/attn.py     | 37 +++++++++++++++---------
 colossalai/shardformer/modeling/llama.py |  2 ++
 2 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 5f0e9261c0de..1f897c1be2e0 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -4,6 +4,7 @@
 import torch
 import torch.distributed
 import torch.distributed as dist
+from torch.distributed import ProcessGroup
 import torch.nn.functional as F
 from einops import rearrange
 
@@ -431,7 +432,7 @@ class RingAttention(torch.autograd.Function):
     INTER_RING_GROUP_COPY: dist.ProcessGroup = None
 
     @staticmethod
-    def get_double_ring_groups(sp_group, inner_ring_size=None):
+    def get_double_ring_groups(sp_group,tp_group, inner_ring_size=None):
         """
         Get 2D ring groups for the given process group. Generally, to avoid congestion, the inner ring size
         shouldn't be larger than the number of NICs on each node.
@@ -443,6 +444,7 @@ def get_double_ring_groups(sp_group, inner_ring_size=None):
         """
         sp_size = dist.get_world_size(sp_group)
         sp_rank = dist.get_rank(sp_group)
+        tp_size = dist.get_world_size(tp_group)
 
         if inner_ring_size is None:
             if torch.cuda.device_count() >= dist.get_world_size():
@@ -471,19 +473,24 @@ def get_double_ring_groups(sp_group, inner_ring_size=None):
         inner_ring_group = None
         inter_ring_group = None
 
+        world_size = dist.get_world_size()
+        rank = dist.get_rank()
+        groups = int(world_size/ sp_size)
         # Create inner ring groups
-        for i in range(inner_ring_size):
-            ranks = list(range(i * inner_ring_size, (i + 1) * inner_ring_size))
-            group = dist.new_group(ranks)
-            if sp_rank in ranks:
-                inner_ring_group = group
+        for group_id in range(groups):
+            for i in range(inner_ring_size):
+                ranks = list(range(i +(group_id*sp_size), (1+group_id)*sp_size, inner_ring_size))
+                group = dist.new_group(ranks)
+                if rank in ranks:
+                    inner_ring_group = group
 
         # Create inter ring groups
-        for i in range(num_rings):
-            ranks = list(range(i, sp_size, num_rings))
-            group = dist.new_group(ranks)
-            if sp_rank in ranks:
-                inter_ring_group = group
+        for group_id in range(groups):
+            for i in range(num_rings):
+                ranks = list(range(i+group_id * num_rings, world_size, sp_size))
+                group = dist.new_group(ranks)
+                if rank in ranks:
+                    inter_ring_group = group
 
         return inner_ring_group, inter_ring_group
 
@@ -493,6 +500,7 @@ def attention(
         k,
         v,
         sp_group,
+        tp_group,
         attention_mask_type,
         cu_seqlens=None,
         max_seqlen=None,
@@ -537,7 +545,6 @@ def attention(
             RingAttention.ATTN_DONE = torch.cuda.Event()
         if RingAttention.SP_STREAM is None:
             RingAttention.SP_STREAM = torch.cuda.Stream()
-
         assert (
             q.shape[2] == k.shape[2]
         ), "Q, K and V having different sequence lengths (inference or cross-attn)\
@@ -550,7 +557,7 @@ def attention(
 
         if RingAttention.SP_GROUP is not sp_group:
             RingAttention.SP_GROUP = sp_group
-            inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(sp_group, inner_ring_size)
+            inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(sp_group, tp_group, inner_ring_size)
             RingAttention.INNER_RING_GROUP = inner_ring_group
             RingAttention.INTER_RING_GROUP = inter_ring_group
         else:
@@ -597,6 +604,7 @@ def attention(
             attention_mask_type == AttnMaskType.PADDED_CAUSAL,
             inner_ring_group,
             inter_ring_group,
+            tp_group,
         )
 
         if attention_mask_type == AttnMaskType.PADDED_CAUSAL:
@@ -627,6 +635,7 @@ def forward(
         is_packed: Optional[bool] = False,
         inner_ring_group: Optional[dist.ProcessGroup] = None,
         inter_ring_group: Optional[dist.ProcessGroup] = None,
+        tp_group: Optional[dist.ProcessGroup] = None,
     ):
 
         cu_seqlens_q = cu_seqlens_kv = cu_seqlens
@@ -1123,7 +1132,7 @@ def _other_ring_backward(ring_num_idx, dq):
         if not is_packed:
             dq, dk, dv = [x.view(b, sq, *x.shape[-2:]) for x in (dq, dk, dv)]
 
-        return (dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None)
+        return (dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None, None)
 
     @staticmethod
     def prepare_varlen_batch(
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 47c17e7494f2..fc5bcac6b8d4 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -563,12 +563,14 @@ def forward(
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
+        tp_group = shard_config.tensor_parallel_process_group
         if sp_mode == "ring_attn":
             attn_output = RingAttention.attention(
                 query_states,
                 key_states,
                 value_states,
                 sp_group,
+                tp_group,
                 **attention_mask,
                 inner_ring_size=shard_config.inner_ring_size,
             )

From 65c829771058433a9f7b88299bd1925d05d53554 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 25 Sep 2024 18:51:03 +0800
Subject: [PATCH 096/167] fix the attn

---
 colossalai/shardformer/layer/attn.py    | 2 +-
 colossalai/shardformer/modeling/gpt2.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 1f897c1be2e0..419932a00b28 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -500,7 +500,7 @@ def attention(
         k,
         v,
         sp_group,
-        tp_group,
+        tp_group : Optional[dist.ProcessGroup],
         attention_mask_type,
         cu_seqlens=None,
         max_seqlen=None,
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 798fca88fb4f..8f476ab86a26 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -858,12 +858,14 @@ def forward(
 
         sp_mode = shard_config.sequence_parallelism_mode
         sp_group = shard_config.sequence_parallel_process_group
+        tp_group = shard_config.tensor_parallel_process_group
         if sp_mode == "ring_attn":
             attn_output = RingAttention.attention(
                 query,
                 key,
                 value,
                 sp_group,
+                tp_group,
                 **attention_mask,
                 dropout_p=dropout_p,
                 scale=scale,

From 6fb1322db1525c90f7f80cebad4b447e009bacd4 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 25 Sep 2024 18:56:18 +0800
Subject: [PATCH 097/167] fix

---
 colossalai/shardformer/layer/attn.py     | 20 +++++++++++---------
 colossalai/shardformer/modeling/gpt2.py  |  1 +
 colossalai/shardformer/modeling/llama.py |  1 +
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 419932a00b28..15ad09baa3df 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -4,7 +4,6 @@
 import torch
 import torch.distributed
 import torch.distributed as dist
-from torch.distributed import ProcessGroup
 import torch.nn.functional as F
 from einops import rearrange
 
@@ -432,7 +431,7 @@ class RingAttention(torch.autograd.Function):
     INTER_RING_GROUP_COPY: dist.ProcessGroup = None
 
     @staticmethod
-    def get_double_ring_groups(sp_group,tp_group, inner_ring_size=None):
+    def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
         """
         Get 2D ring groups for the given process group. Generally, to avoid congestion, the inner ring size
         shouldn't be larger than the number of NICs on each node.
@@ -443,8 +442,8 @@ def get_double_ring_groups(sp_group,tp_group, inner_ring_size=None):
             Tuple[dist.ProcessGroup, dist.ProcessGroup]: Inner-ring process group and inter-ring process group.
         """
         sp_size = dist.get_world_size(sp_group)
-        sp_rank = dist.get_rank(sp_group)
-        tp_size = dist.get_world_size(tp_group)
+        dist.get_rank(sp_group)
+        dist.get_world_size(tp_group)
 
         if inner_ring_size is None:
             if torch.cuda.device_count() >= dist.get_world_size():
@@ -475,11 +474,12 @@ def get_double_ring_groups(sp_group,tp_group, inner_ring_size=None):
 
         world_size = dist.get_world_size()
         rank = dist.get_rank()
-        groups = int(world_size/ sp_size)
+        groups = int(world_size / sp_size)
+
         # Create inner ring groups
         for group_id in range(groups):
             for i in range(inner_ring_size):
-                ranks = list(range(i +(group_id*sp_size), (1+group_id)*sp_size, inner_ring_size))
+                ranks = list(range(i + (group_id * sp_size), (1 + group_id) * sp_size, inner_ring_size))
                 group = dist.new_group(ranks)
                 if rank in ranks:
                     inner_ring_group = group
@@ -487,7 +487,7 @@ def get_double_ring_groups(sp_group,tp_group, inner_ring_size=None):
         # Create inter ring groups
         for group_id in range(groups):
             for i in range(num_rings):
-                ranks = list(range(i+group_id * num_rings, world_size, sp_size))
+                ranks = list(range(i + group_id * num_rings, world_size, sp_size))
                 group = dist.new_group(ranks)
                 if rank in ranks:
                     inter_ring_group = group
@@ -500,7 +500,7 @@ def attention(
         k,
         v,
         sp_group,
-        tp_group : Optional[dist.ProcessGroup],
+        tp_group: Optional[dist.ProcessGroup],
         attention_mask_type,
         cu_seqlens=None,
         max_seqlen=None,
@@ -557,7 +557,9 @@ def attention(
 
         if RingAttention.SP_GROUP is not sp_group:
             RingAttention.SP_GROUP = sp_group
-            inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(sp_group, tp_group, inner_ring_size)
+            inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(
+                sp_group, tp_group, inner_ring_size
+            )
             RingAttention.INNER_RING_GROUP = inner_ring_group
             RingAttention.INTER_RING_GROUP = inter_ring_group
         else:
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 8f476ab86a26..01d47bcd0b6f 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -859,6 +859,7 @@ def forward(
         sp_mode = shard_config.sequence_parallelism_mode
         sp_group = shard_config.sequence_parallel_process_group
         tp_group = shard_config.tensor_parallel_process_group
+
         if sp_mode == "ring_attn":
             attn_output = RingAttention.attention(
                 query,
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index fc5bcac6b8d4..b5f505dceab3 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -564,6 +564,7 @@ def forward(
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
         tp_group = shard_config.tensor_parallel_process_group
+
         if sp_mode == "ring_attn":
             attn_output = RingAttention.attention(
                 query_states,

From 91ed32c2569b18f03a870f82f7f3ddb4b2da4e4f Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 25 Sep 2024 19:00:38 +0800
Subject: [PATCH 098/167] fix

---
 colossalai/shardformer/layer/attn.py     | 2 +-
 colossalai/shardformer/modeling/gpt2.py  | 2 +-
 colossalai/shardformer/modeling/llama.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 15ad09baa3df..2cc6f31639f8 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -500,8 +500,8 @@ def attention(
         k,
         v,
         sp_group,
-        tp_group: Optional[dist.ProcessGroup],
         attention_mask_type,
+        tp_group=None,
         cu_seqlens=None,
         max_seqlen=None,
         valid_indices=None,
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 01d47bcd0b6f..6be75a3c6bf4 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -866,7 +866,7 @@ def forward(
                 key,
                 value,
                 sp_group,
-                tp_group,
+                tp_group=tp_group,
                 **attention_mask,
                 dropout_p=dropout_p,
                 scale=scale,
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index b5f505dceab3..08f4bc90d6cd 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -571,7 +571,7 @@ def forward(
                 key_states,
                 value_states,
                 sp_group,
-                tp_group,
+                tp_group=tp_group,
                 **attention_mask,
                 inner_ring_size=shard_config.inner_ring_size,
             )

From 6705dad41b001b75a94f1f92da48df666f5e7b55 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 25 Sep 2024 19:02:21 +0800
Subject: [PATCH 099/167] fix

---
 colossalai/shardformer/layer/attn.py     | 2 +-
 colossalai/shardformer/modeling/gpt2.py  | 2 +-
 colossalai/shardformer/modeling/llama.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 2cc6f31639f8..b7738c7e2a10 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -501,7 +501,6 @@ def attention(
         v,
         sp_group,
         attention_mask_type,
-        tp_group=None,
         cu_seqlens=None,
         max_seqlen=None,
         valid_indices=None,
@@ -510,6 +509,7 @@ def attention(
         deterministic=False,
         return_softmax=False,
         inner_ring_size=None,
+        tp_group=None,
         **kwargs,
     ):
         """
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 6be75a3c6bf4..43b3d2c5e52f 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -866,11 +866,11 @@ def forward(
                 key,
                 value,
                 sp_group,
-                tp_group=tp_group,
                 **attention_mask,
                 dropout_p=dropout_p,
                 scale=scale,
                 inner_ring_size=shard_config.inner_ring_size,
+                tp_group=tp_group,
             )
         else:
             attn_output = ColoAttention.attention(query, key, value, **attention_mask, dropout_p=dropout_p, scale=scale)
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 08f4bc90d6cd..aa761fd211e7 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -571,9 +571,9 @@ def forward(
                 key_states,
                 value_states,
                 sp_group,
-                tp_group=tp_group,
                 **attention_mask,
                 inner_ring_size=shard_config.inner_ring_size,
+                tp_group=tp_group,
             )
 
         elif shard_config.enable_flash_attention:

From f4daf042700a0ec3ed718a4c37e8b274d5052744 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Thu, 26 Sep 2024 12:29:27 +0800
Subject: [PATCH 100/167] add funding news (#6072)

* add funding news

* add funding news

* add funding news
---
 README.md              | 2 +-
 docs/README-zh-Hans.md | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d67d8ed35527..dcd1e261a263 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@
 </div>
 
 ## Latest News
+* [2024/09] [Singapore Startup HPC-AI Tech Secures 50 Million USD in Series A Funding to Build the Video Generation AI Model and GPU Platform](https://company.hpc-ai.com/blog/singapore-startup-hpc-ai-tech-secures-50-million-usd-in-series-a-funding-to-build-the-video-generation-ai-model-and-gpu-platform)
 * [2024/09] [Reducing AI Large Model Training Costs by 30% Requires Just a Single Line of Code From FP8 Mixed Precision Training Upgrades](https://company.hpc-ai.com/blog/reducing-ai-large-model-training-costs-by-30-requires-just-a-single-line-of-code-from-fp8-mixed-precision-training-upgrades)
 * [2024/06] [Open-Sora Continues Open Source: Generate Any 16-Second 720p HD Video with One Click, Model Weights Ready to Use](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use)
 * [2024/05] [Large AI Models Inference Speed Doubled, Colossal-Inference Open Source Release](https://hpc-ai.com/blog/colossal-inference)
@@ -35,7 +36,6 @@
 * [2024/03] [Open-Sora: Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models](https://hpc-ai.com/blog/open-sora-v1.0)
 * [2024/03] [Open-Sora：Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million](https://hpc-ai.com/blog/open-sora)
 * [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer)
-* [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth)
 
 ## Table of Contents
 <ul>
diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md
index 11e07375c30e..2f3075bbd71a 100644
--- a/docs/README-zh-Hans.md
+++ b/docs/README-zh-Hans.md
@@ -25,6 +25,7 @@
 </div>
 
 ## 新闻
+* [2024/09] [Singapore Startup HPC-AI Tech Secures 50 Million USD in Series A Funding to Build the Video Generation AI Model and GPU Platform](https://company.hpc-ai.com/blog/singapore-startup-hpc-ai-tech-secures-50-million-usd-in-series-a-funding-to-build-the-video-generation-ai-model-and-gpu-platform)
 * [2024/09] [Reducing AI Large Model Training Costs by 30% Requires Just a Single Line of Code From FP8 Mixed Precision Training Upgrades](https://company.hpc-ai.com/blog/reducing-ai-large-model-training-costs-by-30-requires-just-a-single-line-of-code-from-fp8-mixed-precision-training-upgrades)
 * [2024/06] [Open-Sora Continues Open Source: Generate Any 16-Second 720p HD Video with One Click, Model Weights Ready to Use](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use)
 * [2024/05] [Large AI Models Inference Speed Doubled, Colossal-Inference Open Source Release](https://hpc-ai.com/blog/colossal-inference)

From 3fab92166e9547359296d29ebda5b4ce03437502 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Thu, 26 Sep 2024 18:03:09 +0800
Subject: [PATCH 101/167] fix

---
 colossalai/shardformer/layer/attn.py | 46 +++++++++++++++++++---------
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index b7738c7e2a10..21dbe718e38c 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -442,8 +442,7 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
             Tuple[dist.ProcessGroup, dist.ProcessGroup]: Inner-ring process group and inter-ring process group.
         """
         sp_size = dist.get_world_size(sp_group)
-        dist.get_rank(sp_group)
-        dist.get_world_size(tp_group)
+        tp_size = dist.get_world_size(tp_group)
 
         if inner_ring_size is None:
             if torch.cuda.device_count() >= dist.get_world_size():
@@ -476,21 +475,38 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
         rank = dist.get_rank()
         groups = int(world_size / sp_size)
 
-        # Create inner ring groups
-        for group_id in range(groups):
-            for i in range(inner_ring_size):
-                ranks = list(range(i + (group_id * sp_size), (1 + group_id) * sp_size, inner_ring_size))
-                group = dist.new_group(ranks)
+        if tp_size > 1:
+            for group_id in range(groups):
+                for i in range(inner_ring_size):
+                    ranks = list(range(i + (group_id * sp_size), (1 + group_id) * sp_size, inner_ring_size))
+                    group = dist.new_group(ranks)
+                    if rank in ranks:
+                        inner_ring_group = group
+            for group_id in range(groups):
+                for i in range(num_rings):
+                    ranks = list(range(i + group_id * num_rings, world_size, sp_size))
+                    group = dist.new_group(ranks)
+                    if rank in ranks:
+                        inter_ring_group = group
+        else:
+            for i in range(sp_size // 2):
+                ranks = list(range((i) * num_rings, (i + 1) * num_rings, 1))
                 if rank in ranks:
+                    print(
+                        "rank:",
+                        rank,
+                        "inner ranks:",
+                        ranks,
+                    )
+                    group = dist.new_group(ranks)
                     inner_ring_group = group
-
-        # Create inter ring groups
-        for group_id in range(groups):
-            for i in range(num_rings):
-                ranks = list(range(i + group_id * num_rings, world_size, sp_size))
-                group = dist.new_group(ranks)
-                if rank in ranks:
-                    inter_ring_group = group
+            for group_id in range(num_rings):
+                for i in range(num_rings):
+                    ranks = list(range(i + group_id * num_rings, world_size, inner_ring_size))
+                    ranks = [0, 1, 4, 5] if rank == 0 or rank == 1 or rank == 4 or rank == 5 else [2, 3, 6, 7]
+                    if rank in ranks:
+                        group = dist.new_group(ranks)
+                        inter_ring_group = group
 
         return inner_ring_group, inter_ring_group
 

From 3532f77b90ccec4e94e3d8dc2b2e0ac76008cab9 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 9 Oct 2024 10:57:19 +0800
Subject: [PATCH 102/167] fix

---
 colossalai/shardformer/layer/attn.py | 49 ++++++++++++++--------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 21dbe718e38c..36a4ec963c1c 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -443,6 +443,7 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
         """
         sp_size = dist.get_world_size(sp_group)
         tp_size = dist.get_world_size(tp_group)
+        sp_rank = dist.get_rank(sp_group)
 
         if inner_ring_size is None:
             if torch.cuda.device_count() >= dist.get_world_size():
@@ -467,46 +468,44 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
             f"Using 2D Ring Attention with inner ring size {inner_ring_size} to maximze NIC util for inter-node comm. Cross your fingers for speed-ups!",
             ranks=[0],
         )
-        num_rings = sp_size // inner_ring_size
+
         inner_ring_group = None
         inter_ring_group = None
 
         world_size = dist.get_world_size()
         rank = dist.get_rank()
-        groups = int(world_size / sp_size)
+
+        inner_rings = world_size // sp_size
+        num_rings = sp_size // inner_ring_size
 
         if tp_size > 1:
-            for group_id in range(groups):
-                for i in range(inner_ring_size):
-                    ranks = list(range(i + (group_id * sp_size), (1 + group_id) * sp_size, inner_ring_size))
+            for i in range(inner_rings):
+                for j in range(sp_size // tp_size):
+                    # find inner ring group in one sp group
+                    ranks = list(range(j + i * sp_size, j + (i + 1) * sp_size, tp_size))
                     group = dist.new_group(ranks)
                     if rank in ranks:
                         inner_ring_group = group
-            for group_id in range(groups):
-                for i in range(num_rings):
-                    ranks = list(range(i + group_id * num_rings, world_size, sp_size))
+            for i in range(inner_rings):
+                for j in range(sp_size // tp_size):
+                    ranks = list(range(j + i * (sp_size // tp_size), inner_rings + (i + 1) * sp_size, sp_size))
                     group = dist.new_group(ranks)
                     if rank in ranks:
                         inter_ring_group = group
         else:
-            for i in range(sp_size // 2):
-                ranks = list(range((i) * num_rings, (i + 1) * num_rings, 1))
-                if rank in ranks:
-                    print(
-                        "rank:",
-                        rank,
-                        "inner ranks:",
-                        ranks,
-                    )
-                    group = dist.new_group(ranks)
+            # Create inner ring groups
+            for i in range(inner_ring_size):
+                ranks = list(range(i * inner_ring_size, (i + 1) * inner_ring_size))
+                group = dist.new_group(ranks)
+                if sp_rank in ranks:
                     inner_ring_group = group
-            for group_id in range(num_rings):
-                for i in range(num_rings):
-                    ranks = list(range(i + group_id * num_rings, world_size, inner_ring_size))
-                    ranks = [0, 1, 4, 5] if rank == 0 or rank == 1 or rank == 4 or rank == 5 else [2, 3, 6, 7]
-                    if rank in ranks:
-                        group = dist.new_group(ranks)
-                        inter_ring_group = group
+
+            # Create inter ring groups
+            for i in range(num_rings):
+                ranks = list(range(i, sp_size, num_rings))
+                group = dist.new_group(ranks)
+                if sp_rank in ranks:
+                    inter_ring_group = group
 
         return inner_ring_group, inter_ring_group
 

From 3f5bec8dc41fed481b8a14bbb463476d4e98191d Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Wed, 9 Oct 2024 03:58:01 +0000
Subject: [PATCH 103/167] [feat] support zbv in mixtral benchmark;

---
 examples/language/mixtral/benchmark.py | 29 ++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/examples/language/mixtral/benchmark.py b/examples/language/mixtral/benchmark.py
index bb2a32d013f5..7c8a5fe658b2 100644
--- a/examples/language/mixtral/benchmark.py
+++ b/examples/language/mixtral/benchmark.py
@@ -11,6 +11,7 @@
 from model_utils import format_numel_str, get_model_numel
 from performance_evaluator import PerformanceEvaluator, get_profile_context
 from tqdm import tqdm
+from transformers import AutoConfig
 from transformers.models.mixtral import MixtralConfig, MixtralForCausalLM
 
 import colossalai
@@ -20,6 +21,7 @@
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
+from colossalai.pipeline.schedule.v_schedule import PipelineGraph
 from colossalai.shardformer import PipelineGradientCheckpointConfig
 
 warnings.filterwarnings("ignore")
@@ -85,7 +87,7 @@ def main():
     parser.add_argument("--zero", type=int, default=1, help="Zero Stage when hybrid plugin is enabled")
     parser.add_argument("--custom-ckpt", action="store_true", help="Customize checkpoint", default=False)
 
-    parser.add_argument("--pp_style", default="1f1b", choices=["1f1b", "interleaved"])
+    parser.add_argument("--pp_style", default="1f1b", choices=["1f1b", "interleaved", "zbv"])
     parser.add_argument("--n_chunks", default=1, help="number of model chunks", type=eval)
     parser.add_argument("--profile", action="store_true", help="Profile the code")
     parser.add_argument(
@@ -120,7 +122,7 @@ def main():
                 num_ckpt_layers_per_stage=[19, 19, 19, 13],
             ),
             "num_layers_per_stage": [19, 20, 20, 21],
-            "pp_style": "interleaved",
+            # "pp_style": "interleaved",
         }
         if args.custom_ckpt
         else {}
@@ -129,7 +131,29 @@ def main():
     # ==============================
     # Initialize Booster
     # ==============================
+    if args.config in MODEL_CONFIGS:
+        config = MODEL_CONFIGS[args.config]
+    else:
+        config = AutoConfig.from_pretrained(args.config, trust_remote_code=True)
+
     if args.plugin == "3d":
+        if args.pp_style == "zbv":
+            mem_f = 34 * config.hidden_size + 5 * config.num_attention_heads * args.max_length
+            mem_w = -32 * config.hidden_size
+            mem_b = -mem_w - mem_f
+            scheduler_nodes = PipelineGraph(
+                n_stage=args.pp,
+                n_micro=args.batch_size // args.mbs,
+                f_cost=1000,
+                b_cost=1000,
+                w_cost=1000,
+                c_cost=1,
+                f_mem=mem_f,
+                b_mem=mem_b,
+                w_mem=mem_w,
+            ).get_v_schedule()
+        else:
+            scheduler_nodes = None
         plugin = MoeHybridParallelPlugin(
             ep_size=args.ep,
             tp_size=args.tp,
@@ -148,6 +172,7 @@ def main():
             overlap_allgather=args.overlap_allgather,
             use_fp8=args.use_fp8,
             fp8_communication=args.use_fp8_comm,
+            scheduler_nodes=scheduler_nodes,
             **hybrid_kwargs,
         )
     else:

From b635dd06696f8c5deac29333ae85ce8cf68bd550 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Wed, 9 Oct 2024 14:05:26 +0800
Subject: [PATCH 104/167] fix

---
 colossalai/shardformer/layer/attn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 36a4ec963c1c..3853860c4636 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -442,7 +442,7 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
             Tuple[dist.ProcessGroup, dist.ProcessGroup]: Inner-ring process group and inter-ring process group.
         """
         sp_size = dist.get_world_size(sp_group)
-        tp_size = dist.get_world_size(tp_group)
+        tp_size = dist.get_world_size(tp_group) if tp_group is not None else 1
         sp_rank = dist.get_rank(sp_group)
 
         if inner_ring_size is None:

From 9ee80fc828328901f95f8ed3015f3870249e2cff Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Thu, 10 Oct 2024 05:40:22 +0000
Subject: [PATCH 105/167] [fix] MixtralForCausalLMPolicy get_held_layer support
 zbv;

---
 colossalai/shardformer/modeling/mixtral.py |  1 +
 colossalai/shardformer/policies/mixtral.py | 14 ++++++++++++--
 examples/language/mixtral/benchmark.py     |  6 +++++-
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index 4f8ec162f60d..df9b91da2559 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -268,6 +268,7 @@ def mixtral_model_forward(
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         # retrieve input_ids and inputs_embeds
+        print(f"model_chunk_id {stage_manager.model_chunk_id} stage_manager {stage_manager.stage}")
         if stage_manager.is_first_stage():
             # retrieve input_ids and inputs_embeds
             if input_ids is not None and inputs_embeds is not None:
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index af5b15ed5d20..a8cd49dc17de 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -343,8 +343,18 @@ def get_held_layers(self) -> List[Module]:
         """Get pipeline layers for current stage."""
         stage_manager = self.pipeline_stage_manager
         held_layers = super().get_held_layers()
-        if stage_manager.is_last_stage():
-            held_layers.append(self.model.lm_head)
+        if stage_manager.is_interleave:
+            if stage_manager.use_zbv:
+                if stage_manager.is_first_stage(ignore_chunk=True):
+                    held_layers.append(self.model.lm_head)
+            else:
+                if stage_manager.is_last_stage(ignore_chunk=True):
+                    held_layers.append(self.model.lm_head)
+        else:
+            if stage_manager.is_last_stage():
+                held_layers.append(self.model.lm_head)
+        # if stage_manager.is_last_stage():
+        #     held_layers.append(self.model.lm_head)
         return held_layers
 
     def get_shared_params(self) -> List[Dict[int, Tensor]]:
diff --git a/examples/language/mixtral/benchmark.py b/examples/language/mixtral/benchmark.py
index 7c8a5fe658b2..2685afcedd6a 100644
--- a/examples/language/mixtral/benchmark.py
+++ b/examples/language/mixtral/benchmark.py
@@ -167,6 +167,7 @@ def main():
             enable_fused_normalization=torch.cuda.is_available(),
             enable_flash_attention=args.xformers,
             microbatch_size=args.mbs,
+            num_microbatches=args.batch_size // args.mbs,
             precision="bf16",
             enable_metadata_cache=not args.no_cache,
             overlap_allgather=args.overlap_allgather,
@@ -208,8 +209,10 @@ def main():
     with init_ctx:
         model = MixtralForCausalLM(config=config).to(torch.bfloat16)
 
+    # if args.grad_checkpoint:
+    #     model.gradient_checkpointing_enable()
     if args.grad_checkpoint:
-        model.gradient_checkpointing_enable()
+        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
 
     model_numel = get_model_numel(model)
     coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
@@ -224,6 +227,7 @@ def main():
     )
 
     optimizer = HybridAdam(model.parameters())
+    # optimizer = torch.optim.SGD(model.parameters(), lr=1)
     torch.set_default_dtype(torch.bfloat16)
     model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
 

From 72b507a7beeb01f8407c3a6ea76d49bf9e75f040 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Thu, 10 Oct 2024 06:19:51 +0000
Subject: [PATCH 106/167] [feat] update MixtralPipelineForwards -->
 mixtral_model_forward; support zbv;

---
 colossalai/shardformer/modeling/mixtral.py | 254 +++++++++++++++++----
 1 file changed, 212 insertions(+), 42 deletions(-)

diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index df9b91da2559..d1e44aa5bebb 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -267,26 +267,98 @@ def mixtral_model_forward(
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        # retrieve input_ids and inputs_embeds
-        print(f"model_chunk_id {stage_manager.model_chunk_id} stage_manager {stage_manager.stage}")
-        if stage_manager.is_first_stage():
-            # retrieve input_ids and inputs_embeds
-            if input_ids is not None and inputs_embeds is not None:
-                raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
-            elif input_ids is not None:
-                batch_size, seq_length = input_ids.shape
-            elif inputs_embeds is not None:
-                batch_size, seq_length, _ = inputs_embeds.shape
+        if stage_manager.is_interleave:
+            if stage_manager.use_zbv:
+                # zbv
+                if stage_manager.is_first_stage(ignore_chunk=True) and stage_manager.model_chunk_id == 0:
+                    # retrieve input_ids and inputs_embeds
+                    if input_ids is not None and inputs_embeds is not None:
+                        raise ValueError(
+                            "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+                        )
+                    elif input_ids is not None:
+                        batch_size, seq_length = input_ids.shape
+                    elif inputs_embeds is not None:
+                        batch_size, seq_length, _ = inputs_embeds.shape
+                    else:
+                        raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+                    device = input_ids.device if input_ids is not None else inputs_embeds.device
+                    if inputs_embeds is None:
+                        inputs_embeds = self.embed_tokens(input_ids)
+                    hidden_states = inputs_embeds
+                else:
+                    input_shape = hidden_states.shape[:-1]
+                    batch_size, seq_length = input_shape
+                    device = hidden_states.device
             else:
-                raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-            device = input_ids.device if input_ids is not None else inputs_embeds.device
-            if inputs_embeds is None:
-                inputs_embeds = self.embed_tokens(input_ids)
-            hidden_states = inputs_embeds
+                # interleaved
+                if stage_manager.is_first_stage(ignore_chunk=True):
+                    # retrieve input_ids and inputs_embeds
+                    if input_ids is not None and inputs_embeds is not None:
+                        raise ValueError(
+                            "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+                        )
+                    elif input_ids is not None:
+                        batch_size, seq_length = input_ids.shape
+                    elif inputs_embeds is not None:
+                        batch_size, seq_length, _ = inputs_embeds.shape
+                    else:
+                        raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+                    device = input_ids.device if input_ids is not None else inputs_embeds.device
+                    if inputs_embeds is None:
+                        inputs_embeds = self.embed_tokens(input_ids)
+                    hidden_states = inputs_embeds
+                else:
+                    input_shape = hidden_states.shape[:-1]
+                    batch_size, seq_length = input_shape
+                    device = hidden_states.device
         else:
-            input_shape = hidden_states.shape[:-1]
-            batch_size, seq_length = input_shape
-            device = hidden_states.device
+            # 1f1b or None
+            if stage_manager.is_first_stage():  # No ignore_chunk=True for 1f1b
+                # retrieve input_ids and inputs_embeds
+                if input_ids is not None and inputs_embeds is not None:
+                    raise ValueError(
+                        "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+                    )
+                elif input_ids is not None:
+                    batch_size, seq_length = input_ids.shape
+                elif inputs_embeds is not None:
+                    batch_size, seq_length, _ = inputs_embeds.shape
+                else:
+                    raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+                device = input_ids.device if input_ids is not None else inputs_embeds.device
+                if inputs_embeds is None:
+                    inputs_embeds = self.embed_tokens(input_ids)
+                hidden_states = inputs_embeds
+            else:
+                input_shape = hidden_states.shape[:-1]
+                batch_size, seq_length = input_shape
+                device = hidden_states.device
+
+        #######
+        # Attention, we support consider 1f1b, interleaved, zbv
+        #######
+
+        # # retrieve input_ids and inputs_embeds
+        # print(f"model_chunk_id {stage_manager.model_chunk_id} stage_manager {stage_manager.stage}")
+        # if stage_manager.is_first_stage():
+        #     # retrieve input_ids and inputs_embeds
+        #     if input_ids is not None and inputs_embeds is not None:
+        #         raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        #     elif input_ids is not None:
+        #         batch_size, seq_length = input_ids.shape
+        #     elif inputs_embeds is not None:
+        #         batch_size, seq_length, _ = inputs_embeds.shape
+        #     else:
+        #         raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+        #     device = input_ids.device if input_ids is not None else inputs_embeds.device
+        #     if inputs_embeds is None:
+        #         inputs_embeds = self.embed_tokens(input_ids)
+        #     hidden_states = inputs_embeds
+        # else:
+        #     input_shape = hidden_states.shape[:-1]
+        #     batch_size, seq_length = input_shape
+        #     device = hidden_states.device
 
         seq_length_with_past = seq_length
         past_key_values_length = 0
@@ -390,8 +462,22 @@ def custom_forward(*inputs):
             if output_router_logits:
                 all_router_logits += (layer_outputs[-1],)
 
-        if stage_manager.is_last_stage():
-            hidden_states = self.norm(hidden_states)
+        #######
+        # Attention, we support consider 1f1b, interleaved, zbv
+        #######
+        if stage_manager.is_interleave:
+            if stage_manager.use_zbv:
+                if stage_manager.is_first_stage(ignore_chunk=True) and stage_manager.model_chunk_id == 1:
+                    hidden_states = self.norm(hidden_states)
+            else:
+                if stage_manager.is_last_stage(ignore_chunk=True):
+                    hidden_states = self.norm(hidden_states)
+        else:
+            if stage_manager.is_last_stage():  # No ignore_chunk=True for 1f1b
+                hidden_states = self.norm(hidden_states)
+
+        # if stage_manager.is_last_stage():
+        #     hidden_states = self.norm(hidden_states)
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
@@ -400,30 +486,114 @@ def custom_forward(*inputs):
 
         if output_router_logits and past_router_logits is not None:
             all_router_logits = past_router_logits + all_router_logits
-        if stage_manager.is_last_stage():
-            if not return_dict:
-                return tuple(
-                    v
-                    for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
-                    if v is not None
-                )
-            return MoeModelOutputWithPast(
-                last_hidden_state=hidden_states,
-                past_key_values=next_cache,
-                hidden_states=all_hidden_states,
-                attentions=all_self_attns,
-                router_logits=all_router_logits,
-            )
+
+        #######
+        # Attention, we support consider 1f1b, interleaved, zbv
+        #######
+        if stage_manager.is_interleave:
+            if stage_manager.use_zbv:
+                # zbv
+                if stage_manager.is_first_stage(ignore_chunk=True) and stage_manager.model_chunk_id == 1:
+                    if not return_dict:
+                        return tuple(
+                            v
+                            for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
+                            if v is not None
+                        )
+                    return MoeModelOutputWithPast(
+                        last_hidden_state=hidden_states,
+                        past_key_values=next_cache,
+                        hidden_states=all_hidden_states,
+                        attentions=all_self_attns,
+                        router_logits=all_router_logits,
+                    )
+                else:
+                    if output_router_logits:
+                        return {
+                            "hidden_states": hidden_states,
+                            "past_router_logits": all_router_logits,
+                        }
+                    else:
+                        return {
+                            "hidden_states": hidden_states,
+                        }
+            else:
+                # interlearved
+                if stage_manager.is_last_stage(ignore_chunk=True):
+                    if not return_dict:
+                        return tuple(
+                            v
+                            for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
+                            if v is not None
+                        )
+                    return MoeModelOutputWithPast(
+                        last_hidden_state=hidden_states,
+                        past_key_values=next_cache,
+                        hidden_states=all_hidden_states,
+                        attentions=all_self_attns,
+                        router_logits=all_router_logits,
+                    )
+                else:
+                    if output_router_logits:
+                        return {
+                            "hidden_states": hidden_states,
+                            "past_router_logits": all_router_logits,
+                        }
+                    else:
+                        return {
+                            "hidden_states": hidden_states,
+                        }
         else:
-            if output_router_logits:
-                return {
-                    "hidden_states": hidden_states,
-                    "past_router_logits": all_router_logits,
-                }
+            # 1f1b or other
+            if stage_manager.is_last_stage():
+                if not return_dict:
+                    return tuple(
+                        v
+                        for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
+                        if v is not None
+                    )
+                return MoeModelOutputWithPast(
+                    last_hidden_state=hidden_states,
+                    past_key_values=next_cache,
+                    hidden_states=all_hidden_states,
+                    attentions=all_self_attns,
+                    router_logits=all_router_logits,
+                )
             else:
-                return {
-                    "hidden_states": hidden_states,
-                }
+                if output_router_logits:
+                    return {
+                        "hidden_states": hidden_states,
+                        "past_router_logits": all_router_logits,
+                    }
+                else:
+                    return {
+                        "hidden_states": hidden_states,
+                    }
+
+        # if stage_manager.is_last_stage():
+        #     if not return_dict:
+        #         return tuple(
+        #             v
+        #             for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
+        #             if v is not None
+        #         )
+        #     return MoeModelOutputWithPast(
+        #         last_hidden_state=hidden_states,
+        #         past_key_values=next_cache,
+        #         hidden_states=all_hidden_states,
+        #         attentions=all_self_attns,
+        #         router_logits=all_router_logits,
+        #     )
+        # else:
+        #     if output_router_logits:
+        #         return {
+        #             "hidden_states": hidden_states,
+        #             "past_router_logits": all_router_logits,
+        #         }
+        #     else:
+        #         return {
+        #             "hidden_states": hidden_states,
+        #         }
 
     @staticmethod
     def mixtral_for_causal_lm_forward(

From 646b3c5a90ce904b2128cae467c2068f435a9df0 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Thu, 10 Oct 2024 14:34:45 +0800
Subject: [PATCH 107/167] [shardformer] fix linear 1d row and support uneven
 splits for fused qkv linear (#6084)

* [tp] hotfix linear row

* [tp] support uneven split for fused linear

* [tp] support sp for fused linear

* [tp] fix gpt2 mlp policy

* [tp] fix gather fused and add fused linear row
---
 .../modeling/policy/nopadding_baichuan.py     |   4 +-
 colossalai/shardformer/layer/__init__.py      |   3 +-
 colossalai/shardformer/layer/_operation.py    |   4 +-
 colossalai/shardformer/layer/linear.py        |  11 +-
 .../shardformer/layer/qkv_fused_linear.py     | 364 +++++++++++++++---
 colossalai/shardformer/policies/blip2.py      |   2 +-
 colossalai/shardformer/policies/gpt2.py       |   4 +-
 colossalai/shardformer/policies/sam.py        |   2 +-
 .../test_gpt2_qkv_fused_linear_1d.py          |  21 +-
 .../test_layer/test_qkv_fused_linear_1d.py    | 141 ++++---
 10 files changed, 399 insertions(+), 157 deletions(-)

diff --git a/colossalai/inference/modeling/policy/nopadding_baichuan.py b/colossalai/inference/modeling/policy/nopadding_baichuan.py
index 37b5062e887e..8528de75cb28 100644
--- a/colossalai/inference/modeling/policy/nopadding_baichuan.py
+++ b/colossalai/inference/modeling/policy/nopadding_baichuan.py
@@ -57,7 +57,9 @@ def module_policy(self):
                         target_module=NopadBaichuanMLP,
                     ),
                     SubModuleReplacementDescription(
-                        suffix="self_attn.W_pack", target_module=FusedLinear1D_Col, kwargs={"n_fused": 3}
+                        suffix="self_attn.W_pack",
+                        target_module=FusedLinear1D_Col,
+                        kwargs={"split_sizes": [self.model.config.hidden_size] * 3},
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
diff --git a/colossalai/shardformer/layer/__init__.py b/colossalai/shardformer/layer/__init__.py
index 8882a33c15e6..684993de697e 100644
--- a/colossalai/shardformer/layer/__init__.py
+++ b/colossalai/shardformer/layer/__init__.py
@@ -6,7 +6,7 @@
 from .loss import cross_entropy_1d, dist_cross_entropy
 from .normalization import FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm
 from .parallel_module import ParallelModule
-from .qkv_fused_linear import FusedLinear1D_Col, GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
+from .qkv_fused_linear import FusedLinear1D_Col, FusedLinear1D_Row, GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
 
 __all__ = [
     "Embedding1D",
@@ -34,4 +34,5 @@
     "RingAttention",
     "get_pad_info",
     "all_to_all_comm",
+    "FusedLinear1D_Row",
 ]
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index aec82356747a..1d7a1f1042cc 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -840,7 +840,7 @@ def forward(ctx, input_, process_group, scatter_dim, gather_dim, fp8_communicati
         ctx.gather_dim = gather_dim
         ctx.fp8_communication = fp8_communication
         world_size = dist.get_world_size(process_group)
-        bsz, _, _ = input_.shape
+        bsz = input_.shape[0]
 
         # using all_to_all_single when batch size is 1
         if bsz == 1:
@@ -871,7 +871,7 @@ def backward(ctx, grad_output):
         gather_dim = ctx.scatter_dim
         fp8_communication = ctx.fp8_communication
         world_size = dist.get_world_size(process_group)
-        bsz, _, _ = grad_output.shape
+        bsz = grad_output.shape[0]
 
         if bsz == 1:
             return_grad = _all_to_all_single(
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index d77dd496592f..52b0e79c64e0 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -428,11 +428,8 @@ def forward(self, input_: Tensor) -> Tensor:
                     handle.wait()
                 output = torch.cat(output_parallel_list, dim=-1)
         else:
-            if self.seq_parallel_mode is None:
-                output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
-                output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
-            elif self.seq_parallel_mode == "split_gather":
-                output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
+            if self.seq_parallel_mode == "split_gather":
+                output_parallel = F.linear(input_, self.weight)
                 output = reducescatter_forward_gather_backward(
                     output_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
                 )
@@ -445,8 +442,8 @@ def forward(self, input_: Tensor) -> Tensor:
                     ring=True,
                 )
             else:
-                output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
-                output = reduce_forward(output_parallel, self.process_group)
+                output_parallel = F.linear(input_, self.weight)
+                output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
 
         if not self.skip_bias_add:
             if self.bias is not None:
diff --git a/colossalai/shardformer/layer/qkv_fused_linear.py b/colossalai/shardformer/layer/qkv_fused_linear.py
index 6fd689908af0..a1e25ff3a33c 100644
--- a/colossalai/shardformer/layer/qkv_fused_linear.py
+++ b/colossalai/shardformer/layer/qkv_fused_linear.py
@@ -7,6 +7,7 @@
 import torch
 import torch.distributed as dist
 import torch.nn as nn
+import torch.nn.functional as F
 from torch import Tensor
 from torch.distributed import ProcessGroup
 from torch.nn.parameter import Parameter
@@ -24,7 +25,9 @@
 )
 
 from ._operation import (
-    gather_forward_split_backward,
+    gather_forward_reducescatter_backward,
+    linear_gather_forward_reducescatter_backward,
+    linear_reducescatter_forward_gather_backward,
     linear_with_async_comm,
     matmul_gather_forward_reducescatter_backward,
     matmul_with_async_comm,
@@ -44,21 +47,25 @@
 
 
 def split_fused_qkv_in_gpt2_style(
-    qkv: torch.Tensor, n_fused: int, process_group: ProcessGroup, is_transposed: bool = False
+    qkv: torch.Tensor, split_sizes: List[int], process_group: ProcessGroup, is_transposed: bool = False
 ):
     """
     The fused qkv tensor looks like [Q1, Q2, K1, K2, V1, V2], this function will split them into [Q1, K1, V1] and [Q2, K2, V2].
 
     Args:
         qkv (torch.Tensor): The fused qkv tensor.
-        n_fused (int): The number items fused together, defaults to 3 (query, key and value).
+        split_sizes (List[int]): The sizes of the split tensor.
         process_group (ProcessGroup): The process group for distributed communication.
         is_transposed (bool): generally the tensor is the shape of (out_features, in_features). Set this to True if the tensor is in the shape (in_features, out_features).
     """
     # get the number of slice for the fused qkv
     rank = dist.get_rank(group=process_group)
     world_size = dist.get_world_size(group=process_group)
-    order = torch.arange(world_size * n_fused)
+    order = torch.arange(world_size * len(split_sizes))
+    new_split_sizes = []
+    for sz in split_sizes:
+        assert sz % world_size == 0, f"size {sz} is not divisible by world_size {world_size}"
+        new_split_sizes.extend([sz // world_size] * world_size)
 
     # split the fused qkv
     # from
@@ -66,9 +73,9 @@ def split_fused_qkv_in_gpt2_style(
     # to
     # [Q1, Q2, K1, K2, V1, V2]
     if is_transposed:
-        weight_chunks = torch.chunk(qkv, world_size * n_fused, dim=-1)
+        weight_chunks = torch.split(qkv, new_split_sizes, dim=-1)
     else:
-        weight_chunks = torch.chunk(qkv, world_size * n_fused, dim=0)
+        weight_chunks = torch.split(qkv, new_split_sizes, dim=0)
 
     # rearrange the slice into the final order
     # from
@@ -85,18 +92,23 @@ def split_fused_qkv_in_gpt2_style(
 
 
 def gather_fused_qkv_in_gpt2_style(
-    qkv: torch.Tensor, n_fused: int, process_group: ProcessGroup, is_transposed: bool = False
+    qkv: torch.Tensor, split_sizes: List[int], process_group: ProcessGroup, is_transposed: bool = False
 ):
     """
     The splitted qkv tensor looks like [Q1, K1, V1] and [Q2, K2, V2], this function will gather them into [Q1, Q2, K1, K2, V1, V2].
 
     Args:
         qkv (torch.Tensor): The fused qkv tensor.
-        n_fused (int): The number items fused together, defaults to 3 (query, key and value).
+        split_sizes (List[int]): The sizes of the split tensor.
         process_group (ProcessGroup): The process group for distributed communication.
         is_transposed (bool): generally the tensor is the shape of (out_features, in_features). Set this to True if the tensor is in the shape (in_features, out_features).
     """
     world_size = dist.get_world_size(group=process_group)
+    new_split_sizes = []
+    for sz in split_sizes:
+        assert sz % world_size == 0, f"size {sz} is not divisible by world_size {world_size}"
+        new_split_sizes.append(sz // world_size)
+    new_split_sizes = new_split_sizes * world_size
 
     # gather the tensors
     # from
@@ -121,13 +133,13 @@ def gather_fused_qkv_in_gpt2_style(
     # to
     # [Q1, Q2, K1, K2, V1, V2]
     if is_transposed:
-        weight_chunks = torch.chunk(gather_weight, world_size * n_fused, dim=-1)
+        weight_chunks = torch.split(gather_weight, new_split_sizes, dim=-1)
     else:
-        weight_chunks = torch.chunk(gather_weight, world_size * n_fused, dim=0)
+        weight_chunks = torch.split(gather_weight, new_split_sizes, dim=0)
 
     reordered_chunk_list = []
-    for i in range(n_fused):
-        reordered_chunk_list.extend(weight_chunks[i::n_fused])
+    for i in range(len(split_sizes)):
+        reordered_chunk_list.extend(weight_chunks[i :: len(split_sizes)])
 
     if is_transposed:
         reordered_gather_weight = torch.cat(reordered_chunk_list, dim=-1)
@@ -136,6 +148,42 @@ def gather_fused_qkv_in_gpt2_style(
     return reordered_gather_weight
 
 
+class _SplitForwardGatherBackwardFusedQKV(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, qkv: torch.Tensor, split_sizes: List[int], process_group: ProcessGroup):
+        ctx.split_sizes = split_sizes
+        ctx.process_group = process_group
+        return split_fused_qkv_in_gpt2_style(qkv, split_sizes, process_group, is_transposed=True)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_output = gather_fused_qkv_in_gpt2_style(
+            grad_output, ctx.split_sizes, ctx.process_group, is_transposed=True
+        )
+        return grad_output, None, None
+
+
+def split_forward_gather_backward_fused_qkv(qkv: torch.Tensor, split_sizes: List[int], process_group: ProcessGroup):
+    return _SplitForwardGatherBackwardFusedQKV.apply(qkv, split_sizes, process_group)
+
+
+class _GatherForwardSplitBackwardFusedQKV(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, qkv: torch.Tensor, split_sizes: List[int], process_group: ProcessGroup):
+        ctx.split_sizes = split_sizes
+        ctx.process_group = process_group
+        return gather_fused_qkv_in_gpt2_style(qkv, split_sizes, process_group, is_transposed=True)
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grad_output = split_fused_qkv_in_gpt2_style(grad_output, ctx.split_sizes, ctx.process_group, is_transposed=True)
+        return grad_output, None, None
+
+
+def gather_forward_split_backward_fused_qkv(qkv: torch.Tensor, split_sizes: List[int], process_group: ProcessGroup):
+    return _GatherForwardSplitBackwardFusedQKV.apply(qkv, split_sizes, process_group)
+
+
 class GPT2FusedLinearConv1D_Col(ParallelModule):
     r"""Linear layer with column parallelism.
 
@@ -145,10 +193,10 @@ class GPT2FusedLinearConv1D_Col(ParallelModule):
     Args:
         in_features (int): size of each input sample.
         out_features (int): size of each output sample.
+        split_sizes (List[int]): The sizes of the split tensor.
         bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
         dtype (`torch.dtype`): The dtype of parameters, defaults to None.
         device (`torch.device`): The device of parameters, defaults to None.
-        n_fused (int): The number items fused, defaults to 3 (QKV).
         process_group (`torch.distributed.ProcessGroup`): The process group to be used for weight sharding and communication, defaults to None.
         seq_parallel_mode (str): If set to ``None``, it will not use sequence parallel, otherwise will use corresponding mode of sequence parallel, defaults to None.
         gather_output (bool, optional): If true, call all-gather on output and make Y available
@@ -169,6 +217,7 @@ def __init__(
         self,
         in_features: int,
         out_features: int,
+        split_sizes: List[int],
         bias: bool = True,
         dtype: torch.dtype = None,
         device: torch.device = None,
@@ -178,7 +227,6 @@ def __init__(
         seq_parallel_mode: str = None,
         overlap: bool = False,
         skip_bias_add: bool = False,
-        n_fused: int = 3,
         weight: Optional[Parameter] = None,
         bias_: Optional[Parameter] = None,
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
@@ -195,11 +243,15 @@ def __init__(
         self.overlap = overlap
         self.skip_bias_add = skip_bias_add
         self.device = device
-        self.n_fused = n_fused
+        self.split_sizes = split_sizes
         self.process_group = process_group
         self.async_communication = async_communication
         self.fp8_communication = fp8_communication
 
+        assert (
+            sum(split_sizes) == out_features
+        ), f"The sum of split_sizes({sum(split_sizes)}) should be equal to out_features({out_features})."
+
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
 
@@ -223,10 +275,10 @@ def __init__(
             self.weight = weight
 
         def shard_fn(tensor):
-            return split_fused_qkv_in_gpt2_style(tensor, self.n_fused, self.process_group, True)
+            return split_fused_qkv_in_gpt2_style(tensor, self.split_sizes, self.process_group, True)
 
         def gather_fn(tensor):
-            return gather_fused_qkv_in_gpt2_style(tensor, self.n_fused, self.process_group, True)
+            return gather_fused_qkv_in_gpt2_style(tensor, self.split_sizes, self.process_group, True)
 
         if not is_customized_distributed_tensor(self.weight):
             with torch.no_grad():
@@ -252,7 +304,11 @@ def gather_fn(tensor):
 
     @staticmethod
     def from_native_module(
-        module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+        module: nn.Module,
+        process_group: Union[ProcessGroup, List[ProcessGroup]],
+        split_sizes: List[int],
+        *args,
+        **kwargs,
     ) -> ParallelModule:
         r"""
         Convert a huggingface layer `Conv1D` in gpt2 to a parallelized linear layer.
@@ -260,7 +316,7 @@ def from_native_module(
         Args:
             module (`nn.Linear`): The module to be converted.
             process_group (`Union[ProcessGroup, List[ProcessGroup]]`): The process group to be used for weight sharding and communication.
-            n_fused (int): The number of layers to be fused. In GPT2, Q,K,V are fused in one weight.
+            split_sizes (List[int]): The sizes of the split tensor. In GPT2, Q,K,V are fused in one weight.
         """
         LazyInitContext.materialize(module)
         # get the attributes
@@ -291,6 +347,7 @@ def from_native_module(
             process_group=process_group,
             weight=module.weight,
             bias_=module.bias,
+            split_sizes=split_sizes,
             *args,
             **kwargs,
         )
@@ -354,9 +411,7 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
 
         if self.gather_output:
             # All-gather across the partitions.
-            output = gather_forward_split_backward(
-                output_parallel, dim=-1, process_group=self.process_group, fp8_communication=self.fp8_communication
-            )
+            output = gather_forward_split_backward_fused_qkv(output_parallel, self.split_sizes, self.process_group)
         else:
             output = output_parallel
 
@@ -605,10 +660,10 @@ class FusedLinear1D_Col(ParallelModule):
     Args:
         in_features (int): size of each input sample.
         out_features (int): size of each output sample.
+        split_sizes (List[int]): The sizes of the split tensor.
         bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
         dtype (`torch.dtype`): The dtype of parameters, defaults to None.
         device (`torch.device`): The device of parameters, defaults to None.
-        n_fused (int): The number items fused, defaults to 3 (QKV).
         process_group (`torch.distributed.ProcessGroup`): The process group to be used for weight sharding and communication, defaults to None.
         gather_output (bool, optional): If true, call all-gather on output and make Y available
                     to all GPUs, otherwise, every GPU will have its output
@@ -628,14 +683,16 @@ def __init__(
         self,
         in_features: int,
         out_features: int,
+        split_sizes: List[int],
         bias: bool = True,
         dtype: torch.dtype = None,
         device: torch.device = None,
         process_group: ProcessGroup = None,
-        async_communication: bool = False,
         gather_output: bool = False,
+        seq_parallel_mode: str = None,
+        seq_parallel_dim: int = 1,
+        overlap: torch.cuda.Stream = None,
         skip_bias_add: bool = False,
-        n_fused: int = 3,
         weight: Optional[Parameter] = None,
         bias_: Optional[Parameter] = None,
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
@@ -647,13 +704,19 @@ def __init__(
         self.in_features = in_features
         self.out_features = out_features
         self.gather_output = gather_output
+        self.seq_parallel_mode = seq_parallel_mode
+        self.seq_parallel_dim = seq_parallel_dim
+        self.overlap = overlap
         self.skip_bias_add = skip_bias_add
         self.device = device
-        self.n_fused = n_fused
+        self.split_sizes = split_sizes
         self.process_group = process_group
-        self.async_communication = async_communication
         self.fp8_communication = fp8_communication
 
+        assert (
+            sum(split_sizes) == out_features
+        ), f"The sum of split_sizes({sum(split_sizes)}) should be equal to out_features({out_features})."
+
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
 
@@ -677,10 +740,10 @@ def __init__(
             self.weight = weight
 
         def shard_fn(tensor):
-            return split_fused_qkv_in_gpt2_style(tensor, self.n_fused, self.process_group, False)
+            return split_fused_qkv_in_gpt2_style(tensor, self.split_sizes, self.process_group, False)
 
         def gather_fn(tensor):
-            return gather_fused_qkv_in_gpt2_style(tensor, self.n_fused, self.process_group, False)
+            return gather_fused_qkv_in_gpt2_style(tensor, self.split_sizes, self.process_group, False)
 
         if not is_customized_distributed_tensor(self.weight):
             with torch.no_grad():
@@ -706,7 +769,11 @@ def gather_fn(tensor):
 
     @staticmethod
     def from_native_module(
-        module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], n_fused: int, *args, **kwargs
+        module: nn.Module,
+        process_group: Union[ProcessGroup, List[ProcessGroup]],
+        split_sizes: List[int],
+        *args,
+        **kwargs,
     ) -> ParallelModule:
         r"""
         Convert a fused `torch.nn.linear` layer to a parallelized linear layer.
@@ -714,7 +781,7 @@ def from_native_module(
         Args:
             module (`nn.Linear`): The module to be converted.
             process_group (`Union[ProcessGroup, List[ProcessGroup]]`): The process group to be used for weight sharding and communication.
-            n_fused (int): The number of layers to be fused. In common, Q,K,V are fused in one weight.
+            split_sizes (List[int]): The sizes of the split tensor. In common, Q,K,V are fused in one weight.
         """
         LazyInitContext.materialize(module)
 
@@ -737,25 +804,11 @@ def from_native_module(
             process_group=process_group,
             weight=module.weight,
             bias_=module.bias,
-            n_fused=n_fused,
+            split_sizes=split_sizes,
             *args,
             **kwargs,
         )
 
-        # # TODO: copy the sharded weights
-        # with torch.no_grad():
-        #     sharded_weight = split_fused_qkv_in_gpt2_style(module.weight.data,
-        #                                                    n_fused=n_fused,
-        #                                                    process_group=process_group,
-        #                                                    is_transposed=False)
-        #     linear_1d.weight.data.copy_(sharded_weight.data)
-
-        #     if bias:
-        #         sharded_bias = split_fused_qkv_in_gpt2_style(module.bias.data,
-        #                                                      n_fused=n_fused,
-        #                                                      process_group=process_group,
-        #                                                      is_transposed=False)
-        #         linear_1d.bias.data.copy_(sharded_bias.data)
         return linear_1d
 
     def reset_parameters(self, weight_initializer, bias_initializer) -> None:
@@ -772,19 +825,30 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
             input_.shape, self.weight.shape, self.weight.shape[-1]
         )
         # Set up backprop all-reduce.
-        # input_parallel = reduce_backward(input_, self.process_group)
         input_parallel = input_
 
         # Matrix multiply.
         bias = self.bias if not self.skip_bias_add else None
 
-        output_parallel = linear_with_async_comm(input_parallel, self.weight, bias, self.process_group, True)
+        if self.seq_parallel_mode == "split_gather":
+            input_parallel = gather_forward_reducescatter_backward(
+                input_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
+            )
+            output_parallel = linear_with_async_comm(
+                input_parallel, self.weight, bias, self.process_group, False, fp8_communication=self.fp8_communication
+            )
+        elif self.seq_parallel_mode == "ring":
+            output_parallel = linear_gather_forward_reducescatter_backward(
+                input_parallel, self.weight, bias, self.process_group, True, self.seq_parallel_dim, self.overlap, True
+            )
+        else:
+            output_parallel = linear_with_async_comm(
+                input_parallel, self.weight, bias, self.process_group, True, fp8_communication=self.fp8_communication
+            )
 
         if self.gather_output:
             # All-gather across the partitions.
-            output = gather_forward_split_backward(
-                output_parallel, dim=-1, process_group=self.process_group, fp8_communication=self.fp8_communication
-            )
+            output = gather_forward_split_backward_fused_qkv(output_parallel, self.split_sizes, self.process_group)
         else:
             output = output_parallel
 
@@ -792,3 +856,201 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
             return output, self.bias
         else:
             return output
+
+
+class FusedLinear1D_Row(ParallelModule):
+    r"""Linear layer with row parallelism
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (`torch.dtype`): The dtype of parameters, defaults to None.
+        parallel_input (bool): If set to ``True``, it's assumed that the input is split, defaults to False.
+        process_group (`torch.distributed.ProcessGroup`): The process group to be used for weight sharding and communication, defaults to None.
+        seq_parallel_mode (`str`): The type of sp mode, it will use sequence parallel when `seq_parallel_mode` is not None. Defaults to None.
+        seq_parallel_dim (`int`): Which dim will sequence parallelism split and gather the sequence.
+        skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        split_sizes: List[int],
+        bias: bool = True,
+        dtype: torch.dtype = None,
+        device: torch.device = None,
+        process_group: ProcessGroup = None,
+        seq_parallel_mode: str = None,
+        seq_parallel_dim: int = 1,
+        parallel_input: bool = True,
+        skip_bias_add: bool = False,
+        weight: Optional[Parameter] = None,
+        bias_: Optional[Parameter] = None,
+        weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
+        bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
+        fp8_communication: bool = False,
+    ):
+        super().__init__()
+        # Keep input parameters
+        self.in_features = in_features
+        self.out_features = out_features
+        self.split_sizes = split_sizes
+        self.parallel_input = parallel_input
+        self.skip_bias_add = skip_bias_add
+        self.process_group = process_group
+        self.seq_parallel_mode = seq_parallel_mode
+        self.seq_parallel_dim = seq_parallel_dim
+        self.num_partitions = dist.get_world_size(self.process_group)
+        self.fp8_communication = fp8_communication
+
+        assert (
+            sum(split_sizes) == in_features
+        ), f"The sum of split_sizes({sum(split_sizes)}) should be equal to in_features({in_features})."
+
+        if skip_bias_add and not bias:
+            raise ValueError("cannot skip bias addition if bias is None")
+
+        # offset the seed with randomizer index and rank
+        seed = torch.random.initial_seed()
+        self.randomizer = create_randomizer_with_offset(seed, process_group=self.process_group)
+
+        # sanity check
+        if weight is not None:
+            assert not bias or bias_ is not None, "bias_ must be provided if bias is True when weight is not None"
+        else:
+            assert bias_ is None, "bias_ must be None if weight is None"
+
+        # Parameters.
+        if weight is None:
+            # Initialize weight.
+            factory_kwargs = {"device": device, "dtype": dtype}
+            self.weight = Parameter(torch.empty(self.out_features, self.in_features, **factory_kwargs))
+        else:
+            weight.data = weight.data.to(device=device, dtype=dtype)
+            self.weight = weight
+
+        def shard_fn(tensor):
+            return split_fused_qkv_in_gpt2_style(tensor, self.split_sizes, self.process_group, True)
+
+        def gather_fn(tensor):
+            return gather_fused_qkv_in_gpt2_style(tensor, self.split_sizes, self.process_group, True)
+
+        if not is_customized_distributed_tensor(self.weight):
+            with torch.no_grad():
+                sharded_weight = distribute_tensor_with_customization(self.weight.data, shard_fn, gather_fn)
+            customized_distributed_tensor_to_existing_param(sharded_weight, self.weight)
+
+        if bias:
+            if bias_ is None:
+                self.bias = Parameter(torch.empty(self.out_features, **factory_kwargs))
+            else:
+                bias_.data = bias_.data.to(device=device, dtype=dtype)
+                self.bias = bias_
+        else:
+            self.bias = None
+
+        if weight is None:
+            with self.randomizer.fork_rng(enable_cpu=True):
+                self.reset_parameters(weight_initializer, bias_initializer)
+
+    @staticmethod
+    def from_native_module(
+        module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], split_sizes: List[int], **kwargs
+    ) -> ParallelModule:
+        r"""
+        Convert a native PyTorch linear layer to a parallelized linear layer.
+        """
+        LazyInitContext.materialize(module)
+        # get the attributes
+        in_features = module.in_features
+        out_features = module.out_features
+        bias = module.bias is not None
+        device = module.weight.device
+
+        # ensure only one process group is passed
+        if isinstance(process_group, (list, tuple)):
+            assert len(process_group) == 1, f"Expected only one process group, got {len(process_group)}."
+            process_group = process_group[0]
+
+        linear_1d = FusedLinear1D_Row(
+            in_features=in_features,
+            out_features=out_features,
+            bias=bias,
+            device=device,
+            process_group=process_group,
+            weight=module.weight,
+            bias_=module.bias,
+            split_sizes=split_sizes,
+            **kwargs,
+        )
+
+        return linear_1d
+
+    @torch.no_grad()
+    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
+        fan_in, fan_out = self.in_features, self.out_features
+        weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
+
+        if self.bias is not None:
+            bias_initializer(self.bias, fan_in=fan_in)
+            if self.process_group is None:
+                src_rank = 0
+            else:
+                src_rank = dist.distributed_c10d._get_global_rank(self.process_group, 0)
+
+            origin_device = self.bias.device
+            bias = self.bias.cuda()
+            dist.broadcast(bias, src=src_rank, group=self.process_group)
+            bias = bias.to(origin_device)
+            self.bias.copy_(bias)
+
+    def forward(self, input_: Tensor) -> Tensor:
+        # Set up backprop all-reduce.
+        if self.parallel_input:
+            assert (
+                input_.shape[-1] == self.weight.shape[-1]
+            ), "Invalid shapes in Linear1D_Row forward: input={}, weight={}. Expected last dim of input {}.".format(
+                input_.shape, self.weight.shape, self.weight.shape[-1]
+            )
+            input_ = input_
+        else:
+            assert (
+                divide(input_.shape[-1], self.num_partitions) == self.weight.shape[-1]
+            ), "Invalid shapes in Linear1D_Row forward: input={}, weight={}. Expected last dim of input {}.".format(
+                input_.shape, self.weight.shape, self.weight.shape[-1] * self.num_partitions
+            )
+            input_ = split_forward_gather_backward_fused_qkv(input_, self.split_sizes, self.process_group)
+
+        if self.seq_parallel_mode == "split_gather":
+            output_parallel = F.linear(input_, self.weight)
+            output = reducescatter_forward_gather_backward(
+                output_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
+            )
+        elif self.seq_parallel_mode == "ring":
+            output = linear_reducescatter_forward_gather_backward(
+                input_,
+                self.weight,
+                process_group=self.process_group,
+                dim=self.seq_parallel_dim,
+                ring=True,
+            )
+        else:
+            output_parallel = F.linear(input_, self.weight)
+            output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
+
+        if not self.skip_bias_add:
+            if self.bias is not None:
+                output = output + self.bias
+            return output
+        else:
+            return output, self.bias
diff --git a/colossalai/shardformer/policies/blip2.py b/colossalai/shardformer/policies/blip2.py
index da798f6a0521..2e73d5c2a637 100644
--- a/colossalai/shardformer/policies/blip2.py
+++ b/colossalai/shardformer/policies/blip2.py
@@ -71,7 +71,7 @@ def module_policy(self):
                         suffix="self_attn.qkv",
                         target_module=col_nn.FusedLinear1D_Col,
                         kwargs={
-                            "n_fused": 3,
+                            "split_sizes": [self.model.config.vision_config.hidden_size] * 3,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index d9233be9a822..faacf91b265a 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -92,7 +92,7 @@ def module_policy(self):
                         suffix="attn.c_attn",
                         target_module=col_nn.GPT2FusedLinearConv1D_Col,
                         kwargs={
-                            "n_fused": 3,
+                            "split_sizes": [self.model.config.hidden_size] * 3,
                             "seq_parallel_mode": sp_mode,
                             "overlap": overlap,
                             "fp8_communication": self.shard_config.fp8_communication,
@@ -107,7 +107,7 @@ def module_policy(self):
                         suffix="mlp.c_fc",
                         target_module=col_nn.GPT2FusedLinearConv1D_Col,
                         kwargs={
-                            "n_fused": 1,
+                            "split_sizes": [self.model.config.n_inner or 4 * self.model.config.hidden_size],
                             "seq_parallel_mode": sp_mode,
                             "overlap": overlap,
                             "skip_bias_add": self.enable_bias_gelu_fused,
diff --git a/colossalai/shardformer/policies/sam.py b/colossalai/shardformer/policies/sam.py
index 674fe5e58799..a94cc9119356 100644
--- a/colossalai/shardformer/policies/sam.py
+++ b/colossalai/shardformer/policies/sam.py
@@ -42,7 +42,7 @@ def module_policy(self):
                         suffix="attn.qkv",
                         target_module=col_nn.FusedLinear1D_Col,
                         kwargs={
-                            "n_fused": 3,
+                            "split_sizes": [self.model.config.vision_config.hidden_size] * 3,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
diff --git a/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py b/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
index 5aa8584a0092..923075e0e83d 100644
--- a/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
+++ b/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
@@ -41,21 +41,6 @@ def forward(self, x):
         return x
 
 
-def rearrange(tensor: torch.Tensor, dim: int):
-    tensor = tensor.clone()
-    world_size = 2
-    order = torch.arange(world_size * 3)
-    new_order = []
-    for i in range(world_size):
-        new_order.append(order[i::world_size])
-    new_order = torch.cat(new_order)
-
-    tensor_chunks = torch.chunk(tensor, world_size * 3, dim=dim)
-    rearanged_tensor_chunks = [tensor_chunks[i] for i in new_order]
-    rearanged_tensor = torch.cat(rearanged_tensor_chunks, dim=dim)
-    return rearanged_tensor
-
-
 def check_linear_conv_1d_col(lazy_init: bool, seq_parallel_mode: str, overlap: bool):
     ctx = LazyInitContext() if lazy_init else nullcontext()
     linear = Conv1D(192, 48).cuda()
@@ -66,7 +51,7 @@ def check_linear_conv_1d_col(lazy_init: bool, seq_parallel_mode: str, overlap: b
         process_group=None,
         gather_output=True,
         seq_parallel_mode=seq_parallel_mode,
-        n_fused=3,
+        split_sizes=[64] * 3,
         overlap=overlap,
     )
 
@@ -88,13 +73,13 @@ def check_linear_conv_1d_col(lazy_init: bool, seq_parallel_mode: str, overlap: b
         x.expand_as(x.clone()) if seq_parallel_mode is None else torch.chunk(x.clone(), 2, dim=1)[dist.get_rank()]
     )
     gather_out = linear_conv_col(x_for_shard)
-    assert_close(rearrange(out, -1), gather_out)
+    assert_close(out, gather_out)
 
     # check backward correctness
     out.sum().backward()
     gather_out.sum().backward()
 
-    target_grad = split_fused_qkv_in_gpt2_style(linear.weight.grad, 3, None, True)
+    target_grad = split_fused_qkv_in_gpt2_style(linear.weight.grad, [64] * 3, None, True)
     assert_close(target_grad, linear_conv_col.weight.grad)
 
 
diff --git a/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py b/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py
index dc14fd59175a..fccba564f7c7 100644
--- a/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py
+++ b/tests/test_shardformer/test_layer/test_qkv_fused_linear_1d.py
@@ -2,13 +2,12 @@
 from contextlib import nullcontext
 
 import torch
-import torch.distributed as dist
 import torch.nn as nn
 from torch.testing import assert_close
 
 import colossalai
 from colossalai.lazy import LazyInitContext
-from colossalai.shardformer.layer import GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
+from colossalai.shardformer.layer import FusedLinear1D_Col, FusedLinear1D_Row
 from colossalai.shardformer.layer.qkv_fused_linear import split_fused_qkv_in_gpt2_style
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 
@@ -16,93 +15,55 @@
 os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
 
 
-class Conv1D(nn.Module):
-    """
-    1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
-
-    Basically works like a linear layer but the weights are transposed.
-
-    Args:
-        nf (`int`): The number of output features.
-        nx (`int`): The number of input features.
-    """
-
-    def __init__(self, nf, nx):
-        super().__init__()
-        self.nf = nf
-        self.weight = nn.Parameter(torch.empty(nx, nf))
-        self.bias = nn.Parameter(torch.zeros(nf))
-        nn.init.normal_(self.weight, std=0.02)
-
-    def forward(self, x):
-        size_out = x.size()[:-1] + (self.nf,)
-        x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
-        x = x.view(size_out)
-        return x
-
-
-def rearrange(tensor: torch.Tensor, dim: int):
-    tensor = tensor.clone()
-    world_size = 2
-    order = torch.arange(world_size * 3)
-    new_order = []
-    for i in range(world_size):
-        new_order.append(order[i::world_size])
-    new_order = torch.cat(new_order)
-
-    tensor_chunks = torch.chunk(tensor, world_size * 3, dim=dim)
-    rearanged_tensor_chunks = [tensor_chunks[i] for i in new_order]
-    rearanged_tensor = torch.cat(rearanged_tensor_chunks, dim=dim)
-    return rearanged_tensor
-
-
 @parameterize("lazy_init", [False, True])
-def check_linear_conv_1d_col(lazy_init: bool):
+def check_linear_1d_col(lazy_init: bool):
     ctx = LazyInitContext() if lazy_init else nullcontext()
-    linear = Conv1D(192, 48).cuda()
+    linear = nn.Linear(8, 80).cuda()
     with ctx:
-        linear_copy = Conv1D(192, 48).cuda()
-    linear_conv_col = GPT2FusedLinearConv1D_Col.from_native_module(
-        linear_copy, process_group=None, gather_output=True, n_fused=3
+        linear_copy = nn.Linear(8, 80).cuda()
+    linear_col = FusedLinear1D_Col.from_native_module(
+        linear_copy, process_group=None, gather_output=True, split_sizes=[32, 32, 16]
     )
 
-    assert linear.weight.shape == torch.Size([48, 192])
-    assert linear.bias.shape == torch.Size([192])
-    assert linear_conv_col.weight.shape == torch.Size([48, 96])
-    assert linear_conv_col.bias.shape == torch.Size([96])
-    assert linear_copy.weight is linear_conv_col.weight
-    assert linear_copy.bias is linear_conv_col.bias
+    assert linear.weight.shape == torch.Size([80, 8])
+    assert linear.bias.shape == torch.Size([80])
+    assert linear_col.weight.shape == torch.Size([40, 8])
+    assert linear_col.bias.shape == torch.Size([40])
+    assert linear_copy.weight is linear_col.weight
+    assert linear_copy.bias is linear_col.bias
 
     # ensure weights are reversibly loadable
-    linear_conv_col.load_state_dict(linear.state_dict())
-    linear.load_state_dict(linear_conv_col.state_dict())
+    linear_col.load_state_dict(linear.state_dict())
+    linear.load_state_dict(linear_col.state_dict())
 
     # check computation correctness
-    x = torch.rand(4, 48).cuda()
+    x = torch.rand(4, 8).cuda()
     out = linear(x)
-    gather_out = linear_conv_col(x)
-    assert_close(rearrange(out, 1), gather_out)
+    gather_out = linear_col(x)
+    assert_close(out, gather_out)
 
     # check backward correctness
     out.sum().backward()
     gather_out.sum().backward()
 
-    target_grad = split_fused_qkv_in_gpt2_style(linear.weight.grad, 3, None, True)
-    assert_close(target_grad, linear_conv_col.weight.grad)
+    target_grad = split_fused_qkv_in_gpt2_style(linear.weight.grad, [32, 32, 16], None, False)
+    assert_close(target_grad, linear_col.weight.grad)
 
 
 @parameterize("lazy_init", [False, True])
-def check_linear_conv_1d_row(lazy_init: bool):
+def check_linear_1d_row(lazy_init: bool):
     ctx = LazyInitContext() if lazy_init else nullcontext()
 
-    linear = Conv1D(192, 48).cuda()
+    linear = nn.Linear(80, 8).cuda()
     with ctx:
-        linear_copy = Conv1D(192, 48).cuda()
-    linear_row = GPT2FusedLinearConv1D_Row.from_native_module(linear_copy, process_group=None, parallel_input=False)
+        linear_copy = nn.Linear(80, 8).cuda()
+    linear_row = FusedLinear1D_Row.from_native_module(
+        linear_copy, process_group=None, split_sizes=[32, 32, 16], parallel_input=False
+    )
 
-    assert linear.weight.shape == torch.Size([48, 192])
-    assert linear_row.weight.shape == torch.Size([24, 192])
-    assert linear_row.bias.shape == torch.Size([192])
+    assert linear.weight.shape == torch.Size([8, 80])
+    assert linear_row.weight.shape == torch.Size([8, 40])
+    assert linear_row.bias.shape == torch.Size([8])
     assert linear_copy.weight is linear_row.weight
     assert linear_copy.bias is linear_row.bias
 
@@ -111,7 +72,7 @@ def check_linear_conv_1d_row(lazy_init: bool):
     linear.load_state_dict(linear_row.state_dict())
 
     # check computation correctness
-    x = torch.rand(4, 48).cuda()
+    x = torch.rand(4, 80).cuda()
     out = linear(x)
     gather_out = linear_row(x)
     assert_close(out, gather_out)
@@ -120,17 +81,51 @@ def check_linear_conv_1d_row(lazy_init: bool):
     out.sum().backward()
     gather_out.sum().backward()
 
-    rank = dist.get_rank()
-    target_grad = torch.chunk(linear.weight.grad, 2, dim=0)[rank]
+    target_grad = split_fused_qkv_in_gpt2_style(linear.weight.grad, [32, 32, 16], None, True)
     assert_close(target_grad, linear_row.weight.grad)
 
 
+@parameterize("lazy_init", [False, True])
+def check_linear_1d_col_row(lazy_init: bool):
+    ctx = LazyInitContext() if lazy_init else nullcontext()
+
+    linear1 = nn.Linear(8, 80).cuda()
+    linear2 = nn.Linear(80, 8).cuda()
+    with ctx:
+        linear1_copy = nn.Linear(8, 80).cuda()
+        linear2_copy = nn.Linear(80, 8).cuda()
+    linear_col = FusedLinear1D_Col.from_native_module(linear1_copy, process_group=None, split_sizes=[32, 32, 16])
+    linear_row = FusedLinear1D_Row.from_native_module(
+        linear2_copy,
+        process_group=None,
+        split_sizes=[32, 32, 16],
+    )
+    # ensure weights are reversibly loadable
+    linear_col.load_state_dict(linear1.state_dict())
+    linear_row.load_state_dict(linear2.state_dict())
+
+    # check computation correctness
+    x = torch.rand(4, 8).cuda()
+    target_out = linear2(linear1(x))
+    out = linear_row(linear_col(x))
+    assert_close(out, target_out)
+
+    # check backward correctness
+    target_out.sum().backward()
+    out.sum().backward()
+
+    target_grad1 = split_fused_qkv_in_gpt2_style(linear1.weight.grad, [32, 32, 16], None, False)
+    assert_close(target_grad1, linear_col.weight.grad)
+    target_grad2 = split_fused_qkv_in_gpt2_style(linear2.weight.grad, [32, 32, 16], None, True)
+    assert_close(target_grad2, linear_row.weight.grad)
+
+
 def run_dist(rank, world_size, port):
     colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
 
-    # test for linear conv
-    check_linear_conv_1d_col()
-    check_linear_conv_1d_row()
+    check_linear_1d_col()
+    check_linear_1d_row()
+    check_linear_1d_col_row()
 
 
 @rerun_if_address_is_in_use()

From e234dfa236e9f94f250c5858efdd0cd607326fdd Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Thu, 10 Oct 2024 06:57:35 +0000
Subject: [PATCH 108/167] [feat] support MixtralPipelineForwards-->
 mixtral_for_causal_lm_forward for zbv

---
 colossalai/shardformer/modeling/mixtral.py    | 235 ++++++++++++++----
 .../test_schedule/test_zerobubble_pp.py       |   2 +
 2 files changed, 194 insertions(+), 43 deletions(-)

diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index d1e44aa5bebb..3709af54c486 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -679,52 +679,201 @@ def mixtral_for_causal_lm_forward(
         )
         past_key_values = None
 
-        if stage_manager.is_last_stage():
-            hidden_states = outputs[0]
-            logits = self.lm_head(hidden_states)
-            logits = logits.float()
-
-            loss = None
-            if labels is not None:
-                # Shift so that tokens < n predict n
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-                # Flatten the tokens
-                loss_fct = CrossEntropyLoss()
-                shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                shift_labels = shift_labels.view(-1)
-                # Enable model parallelism
-                shift_labels = shift_labels.to(shift_logits.device)
-                loss = loss_fct(shift_logits, shift_labels)
-
-            aux_loss = None
-            if output_router_logits:
-                aux_loss = load_balancing_loss_func(outputs[-1], self.num_experts, self.num_experts_per_tok)
+        #######
+        # Attention, we support consider 1f1b, interleaved, zbv
+        #######
+        if stage_manager.is_interleave:
+            if stage_manager.use_zbv:
+                # zbv
+                if stage_manager.is_first_stage(ignore_chunk=True) and stage_manager.model_chunk_id == 1:
+                    hidden_states = outputs[0]
+                    logits = self.lm_head(hidden_states)
+                    logits = logits.float()
+
+                    loss = None
+                    if labels is not None:
+                        # Shift so that tokens < n predict n
+                        shift_logits = logits[..., :-1, :].contiguous()
+                        shift_labels = labels[..., 1:].contiguous()
+                        # Flatten the tokens
+                        loss_fct = CrossEntropyLoss()
+                        shift_logits = shift_logits.view(-1, self.config.vocab_size)
+                        shift_labels = shift_labels.view(-1)
+                        # Enable model parallelism
+                        shift_labels = shift_labels.to(shift_logits.device)
+                        loss = loss_fct(shift_logits, shift_labels)
+
+                    aux_loss = None
+                    if output_router_logits:
+                        aux_loss = load_balancing_loss_func(outputs[-1], self.num_experts, self.num_experts_per_tok)
+                        if labels is not None:
+                            loss += self.router_aux_loss_coef * aux_loss
+
+                    if not return_dict:
+                        output = (logits,) + outputs[1:]
+                        if output_router_logits:
+                            output = (aux_loss,) + output
+                        return (loss,) + output if loss is not None else output
+
+                    return MoeCausalLMOutputWithPast(
+                        loss=loss,
+                        aux_loss=aux_loss,
+                        logits=logits,
+                        past_key_values=None,
+                        hidden_states=outputs[0],
+                        attentions=None,
+                        router_logits=outputs[-1],
+                    )
+                else:
+                    out = {}
+                    hidden_states = outputs.get("hidden_states")
+                    out["hidden_states"] = hidden_states
+                    if output_router_logits:
+                        out["past_router_logits"] = outputs["past_router_logits"]
+                    return out
+            else:
+                # interleaved
+                if stage_manager.is_last_stage(ignore_chunk=True):
+                    hidden_states = outputs[0]
+                    logits = self.lm_head(hidden_states)
+                    logits = logits.float()
+
+                    loss = None
+                    if labels is not None:
+                        # Shift so that tokens < n predict n
+                        shift_logits = logits[..., :-1, :].contiguous()
+                        shift_labels = labels[..., 1:].contiguous()
+                        # Flatten the tokens
+                        loss_fct = CrossEntropyLoss()
+                        shift_logits = shift_logits.view(-1, self.config.vocab_size)
+                        shift_labels = shift_labels.view(-1)
+                        # Enable model parallelism
+                        shift_labels = shift_labels.to(shift_logits.device)
+                        loss = loss_fct(shift_logits, shift_labels)
+
+                    aux_loss = None
+                    if output_router_logits:
+                        aux_loss = load_balancing_loss_func(outputs[-1], self.num_experts, self.num_experts_per_tok)
+                        if labels is not None:
+                            loss += self.router_aux_loss_coef * aux_loss
+
+                    if not return_dict:
+                        output = (logits,) + outputs[1:]
+                        if output_router_logits:
+                            output = (aux_loss,) + output
+                        return (loss,) + output if loss is not None else output
+
+                    return MoeCausalLMOutputWithPast(
+                        loss=loss,
+                        aux_loss=aux_loss,
+                        logits=logits,
+                        past_key_values=None,
+                        hidden_states=outputs[0],
+                        attentions=None,
+                        router_logits=outputs[-1],
+                    )
+                else:
+                    out = {}
+                    hidden_states = outputs.get("hidden_states")
+                    out["hidden_states"] = hidden_states
+                    if output_router_logits:
+                        out["past_router_logits"] = outputs["past_router_logits"]
+                    return out
+        else:
+            # 1f1b or otherwise
+            if stage_manager.is_last_stage():
+                hidden_states = outputs[0]
+                logits = self.lm_head(hidden_states)
+                logits = logits.float()
+
+                loss = None
                 if labels is not None:
-                    loss += self.router_aux_loss_coef * aux_loss
+                    # Shift so that tokens < n predict n
+                    shift_logits = logits[..., :-1, :].contiguous()
+                    shift_labels = labels[..., 1:].contiguous()
+                    # Flatten the tokens
+                    loss_fct = CrossEntropyLoss()
+                    shift_logits = shift_logits.view(-1, self.config.vocab_size)
+                    shift_labels = shift_labels.view(-1)
+                    # Enable model parallelism
+                    shift_labels = shift_labels.to(shift_logits.device)
+                    loss = loss_fct(shift_logits, shift_labels)
+
+                aux_loss = None
+                if output_router_logits:
+                    aux_loss = load_balancing_loss_func(outputs[-1], self.num_experts, self.num_experts_per_tok)
+                    if labels is not None:
+                        loss += self.router_aux_loss_coef * aux_loss
 
-            if not return_dict:
-                output = (logits,) + outputs[1:]
+                if not return_dict:
+                    output = (logits,) + outputs[1:]
+                    if output_router_logits:
+                        output = (aux_loss,) + output
+                    return (loss,) + output if loss is not None else output
+
+                return MoeCausalLMOutputWithPast(
+                    loss=loss,
+                    aux_loss=aux_loss,
+                    logits=logits,
+                    past_key_values=None,
+                    hidden_states=outputs[0],
+                    attentions=None,
+                    router_logits=outputs[-1],
+                )
+            else:
+                out = {}
+                hidden_states = outputs.get("hidden_states")
+                out["hidden_states"] = hidden_states
                 if output_router_logits:
-                    output = (aux_loss,) + output
-                return (loss,) + output if loss is not None else output
-
-            return MoeCausalLMOutputWithPast(
-                loss=loss,
-                aux_loss=aux_loss,
-                logits=logits,
-                past_key_values=None,
-                hidden_states=outputs[0],
-                attentions=None,
-                router_logits=outputs[-1],
-            )
-        else:
-            out = {}
-            hidden_states = outputs.get("hidden_states")
-            out["hidden_states"] = hidden_states
-            if output_router_logits:
-                out["past_router_logits"] = outputs["past_router_logits"]
-            return out
+                    out["past_router_logits"] = outputs["past_router_logits"]
+                return out
+
+        # if stage_manager.is_last_stage():
+        #     hidden_states = outputs[0]
+        #     logits = self.lm_head(hidden_states)
+        #     logits = logits.float()
+
+        #     loss = None
+        #     if labels is not None:
+        #         # Shift so that tokens < n predict n
+        #         shift_logits = logits[..., :-1, :].contiguous()
+        #         shift_labels = labels[..., 1:].contiguous()
+        #         # Flatten the tokens
+        #         loss_fct = CrossEntropyLoss()
+        #         shift_logits = shift_logits.view(-1, self.config.vocab_size)
+        #         shift_labels = shift_labels.view(-1)
+        #         # Enable model parallelism
+        #         shift_labels = shift_labels.to(shift_logits.device)
+        #         loss = loss_fct(shift_logits, shift_labels)
+
+        #     aux_loss = None
+        #     if output_router_logits:
+        #         aux_loss = load_balancing_loss_func(outputs[-1], self.num_experts, self.num_experts_per_tok)
+        #         if labels is not None:
+        #             loss += self.router_aux_loss_coef * aux_loss
+
+        #     if not return_dict:
+        #         output = (logits,) + outputs[1:]
+        #         if output_router_logits:
+        #             output = (aux_loss,) + output
+        #         return (loss,) + output if loss is not None else output
+
+        #     return MoeCausalLMOutputWithPast(
+        #         loss=loss,
+        #         aux_loss=aux_loss,
+        #         logits=logits,
+        #         past_key_values=None,
+        #         hidden_states=outputs[0],
+        #         attentions=None,
+        #         router_logits=outputs[-1],
+        #     )
+        # else:
+        #     out = {}
+        #     hidden_states = outputs.get("hidden_states")
+        #     out["hidden_states"] = hidden_states
+        #     if output_router_logits:
+        #         out["past_router_logits"] = outputs["past_router_logits"]
+        #     return out
 
 
 def get_mixtral_flash_attention_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index 384ed649055c..1e8f1392e470 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -786,6 +786,8 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
     seed_all(10086)
 
     torch_model = MixtralModel(config).to(dtype).cuda()
+    # TODO: Support MixtralForCausalLM
+    # torch_model = MixtralForCausalLM(config).to(dtype).cuda()
     torch_optimizer = torch.optim.SGD(torch_model.parameters(), lr=1)
     # init schedule
     h, a, s = config.hidden_size, config.num_attention_heads, 1024

From f98384aef67f413310d507d3dc873578e7ee6a39 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Thu, 10 Oct 2024 15:17:06 +0800
Subject: [PATCH 109/167] fix

---
 colossalai/shardformer/layer/attn.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 3853860c4636..75718f608065 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -468,27 +468,29 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
             f"Using 2D Ring Attention with inner ring size {inner_ring_size} to maximze NIC util for inter-node comm. Cross your fingers for speed-ups!",
             ranks=[0],
         )
-
+        num_rings = sp_size // inner_ring_size
         inner_ring_group = None
         inter_ring_group = None
 
         world_size = dist.get_world_size()
         rank = dist.get_rank()
 
-        inner_rings = world_size // sp_size
-        num_rings = sp_size // inner_ring_size
+        num_ring_size = world_size // num_rings
+        num_inner_group = num_ring_size // inner_ring_size
 
         if tp_size > 1:
-            for i in range(inner_rings):
-                for j in range(sp_size // tp_size):
-                    # find inner ring group in one sp group
-                    ranks = list(range(j + i * sp_size, j + (i + 1) * sp_size, tp_size))
+            # inner_ring_size = 2
+            for i in range(num_rings):
+                for j in range(num_inner_group):
+                    # find inner ring group in one sp groups
+                    ranks = list(range(j + i * num_ring_size, j + (i + 1) * num_ring_size, tp_size))
                     group = dist.new_group(ranks)
                     if rank in ranks:
                         inner_ring_group = group
-            for i in range(inner_rings):
-                for j in range(sp_size // tp_size):
-                    ranks = list(range(j + i * (sp_size // tp_size), inner_rings + (i + 1) * sp_size, sp_size))
+            for i in range(num_rings):
+                for j in range(num_inner_group):
+                    start = j + (i * num_inner_group)
+                    ranks = list(range(start, start + num_ring_size + 1, num_ring_size))
                     group = dist.new_group(ranks)
                     if rank in ranks:
                         inter_ring_group = group

From 5ecc27e1509575e605c47279f243491e20968558 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Thu, 10 Oct 2024 15:35:52 +0800
Subject: [PATCH 110/167] fix

---
 colossalai/shardformer/layer/attn.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 75718f608065..a191694c137b 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -479,7 +479,6 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
         num_inner_group = num_ring_size // inner_ring_size
 
         if tp_size > 1:
-            # inner_ring_size = 2
             for i in range(num_rings):
                 for j in range(num_inner_group):
                     # find inner ring group in one sp groups

From 6b2c506fc55d973888078eba4803bd089c532d0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A2=81=E7=88=BD?=
 <100194095+supercooledith@users.noreply.github.com>
Date: Thu, 10 Oct 2024 17:02:49 +0800
Subject: [PATCH 111/167] Update README.md (#6087)

add HPC-AI.COM activity
---
 README.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/README.md b/README.md
index dcd1e261a263..bf26e154f0b0 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,28 @@
 
 </div>
 
+## GPU Cloud HPC-AI.COM Coming！！
+
+For a limited time, you can access an H100 Server for just $1! This is your chance to leverage premium GPU power at an unbeatable price.
+Plus, when you refer a friend, you’ll receive 20% cashback or compute credits equal to 100% of their top-up!
+
+Our platform offers on-demand premium compute, ensuring safe, permanent data storage even after stopping your instance.
+Don’t miss this incredible opportunity to accelerate your AI projects!
+
+Unlock premium GPUs and register now at [HPC-AI.COM](https://hpc-ai.com) to receive $10!
+
+Special Bonuses:
+
+* Top up $1,000 and receive 300 credits
+* Top up $500 and receive 100 credits
+
+<div align="center">
+   <a href="https://youtu.be/ilMQpU71ddI?si=J4JSPzZ03ycYmlki">
+   <img src="https://github.com/hpcaitech/public_assets/blob/main/colossalai/img/HPCAICOM241010.jpg" width="700" />
+   </a>
+</div>
+
+
 ## Latest News
 * [2024/09] [Singapore Startup HPC-AI Tech Secures 50 Million USD in Series A Funding to Build the Video Generation AI Model and GPU Platform](https://company.hpc-ai.com/blog/singapore-startup-hpc-ai-tech-secures-50-million-usd-in-series-a-funding-to-build-the-video-generation-ai-model-and-gpu-platform)
 * [2024/09] [Reducing AI Large Model Training Costs by 30% Requires Just a Single Line of Code From FP8 Mixed Precision Training Upgrades](https://company.hpc-ai.com/blog/reducing-ai-large-model-training-costs-by-30-requires-just-a-single-line-of-code-from-fp8-mixed-precision-training-upgrades)

From efe3042bb235064b4ebff8f9549d6eb93d653302 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Thu, 10 Oct 2024 18:38:47 +0800
Subject: [PATCH 112/167] fix

---
 colossalai/shardformer/layer/attn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index a191694c137b..7c25bee1a3a2 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -482,7 +482,8 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
             for i in range(num_rings):
                 for j in range(num_inner_group):
                     # find inner ring group in one sp groups
-                    ranks = list(range(j + i * num_ring_size, j + (i + 1) * num_ring_size, tp_size))
+                    start = j + i * num_ring_size
+                    ranks = list(range(start, start + tp_size * inner_ring_size, tp_size))
                     group = dist.new_group(ranks)
                     if rank in ranks:
                         inner_ring_group = group

From dc2cdaf3e8bc865ae5b8d653230876bba8dbf787 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Fri, 11 Oct 2024 13:44:40 +0800
Subject: [PATCH 113/167] [shardformer] optimize seq parallelism (#6086)

* [shardformer] optimize seq parallelism

* [shardformer] fix gpt2 fused linear col

* [plugin] update gemini plugin

* [plugin] update moe hybrid plugin

* [test] update gpt2 fused linear test

* [shardformer] fix gpt2 fused linear reduce
---
 colossalai/booster/plugin/gemini_plugin.py    |   4 -
 .../booster/plugin/hybrid_parallel_plugin.py  |   3 -
 .../plugin/moe_hybrid_parallel_plugin.py      |   3 -
 colossalai/shardformer/layer/_operation.py    | 243 ++++++------------
 colossalai/shardformer/layer/linear.py        |  34 +--
 .../shardformer/layer/qkv_fused_linear.py     |  66 ++---
 colossalai/shardformer/policies/bert.py       |   5 -
 colossalai/shardformer/policies/bloom.py      |   3 -
 colossalai/shardformer/policies/chatglm2.py   |   2 -
 colossalai/shardformer/policies/gpt2.py       |   3 -
 colossalai/shardformer/policies/gptj.py       |   4 -
 colossalai/shardformer/shard/shard_config.py  |  15 --
 .../test_gpt2_qkv_fused_linear_1d.py          |   8 +-
 13 files changed, 113 insertions(+), 280 deletions(-)

diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index ae49aa8b148d..4c8258113018 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -322,7 +322,6 @@ class GeminiPlugin(DPPluginBase):
         enable_flash_attention (bool, optional): Whether to switch on flash attention in Shardformer. Defaults to False.
         enable_jit_fused (bool, optional): Whether to switch on JIT in Shardformer. Default to False.
         enable_sequence_parallelism (bool): Whether to turn on sequence parallelism in Shardformer. Defaults to False.
-        enable_sequence_overlap (bool): Whether to turn on sequence overlap in Shardformer. Defaults to False.
         use_fp8 (bool, optional): Whether to enable fp8 mixed precision training. Defaults to False.
         verbose (bool, optional): verbose mode. Debug info including chunk search result will be printed. Defaults to False.
         fp8_communication (bool, optional): Whether to enable fp8 communication. Defaults to False.
@@ -366,7 +365,6 @@ def __init__(
         enable_flash_attention: bool = False,
         enable_sequence_parallelism: bool = False,
         enable_jit_fused: bool = False,
-        enable_sequence_overlap: bool = False,
         enable_async_reduce: bool = True,
         use_fp8: bool = False,
         verbose: bool = False,
@@ -428,7 +426,6 @@ def __init__(
         self.enable_flash_attention = enable_flash_attention
         self.enable_sequence_parallelism = enable_sequence_parallelism if self.enable_tensor_parallelism else False
         self.enable_jit_fused = enable_jit_fused
-        self.enable_sequence_overlap = enable_sequence_overlap
         self.verbose = verbose
 
         self.tp_size = tp_size
@@ -455,7 +452,6 @@ def __init__(
             enable_flash_attention=self.enable_flash_attention,
             enable_jit_fused=self.enable_jit_fused,
             enable_sequence_parallelism=self.enable_sequence_parallelism,
-            enable_sequence_overlap=self.enable_sequence_overlap,
         )
 
     def __del__(self):
diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index bb663f6a6d5e..0674451a4c7e 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -951,7 +951,6 @@ class HybridParallelPlugin(PipelinePluginBase):
         enable_jit_fused (bool, optional): Whether to switch on JIT in Shardformer. Default to False.
         enable_sequence_parallelism (bool): Whether to turn on sequence parallelism in Shardformer. Defaults to False.
         sequence_parallelism_mode (str): The Sequence parallelism mode. Can only be choosed from ["split_gather", "ring", "all_to_all"]. Defaults to "split_gather".
-        enable_sequence_overlap (bool): Whether to turn on sequence overlap in Shardformer. Defaults to False.
         parallel_output (bool): Whether to keep the output parallel when enabling tensor parallelism. Default to True.
         num_microbatches (int, optional): Number of microbatches when using pipeline parallelism. Defaults to None.
         microbatch_size (int, optional): Microbatch size when using pipeline parallelism.
@@ -1002,7 +1001,6 @@ def __init__(
         enable_jit_fused: bool = False,
         enable_sequence_parallelism: bool = False,
         sequence_parallelism_mode: str = None,
-        enable_sequence_overlap: bool = False,
         parallel_output: bool = True,
         num_microbatches: Optional[int] = None,
         microbatch_size: Optional[int] = None,
@@ -1174,7 +1172,6 @@ def __init__(
             enable_jit_fused=self.enable_jit_fused,
             enable_sequence_parallelism=enable_sequence_parallelism,
             sequence_parallelism_mode=sequence_parallelism_mode,
-            enable_sequence_overlap=enable_sequence_overlap,
             parallel_output=parallel_output,
             make_vocab_size_divisible_by=make_vocab_size_divisible_by,
             gradient_checkpoint_config=gradient_checkpoint_config,
diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index 0807b374901a..2f08d21833f3 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -140,7 +140,6 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
         enable_jit_fused (bool, optional): Whether to switch on JIT in Shardformer. Default to False.
         enable_sequence_parallelism (bool): Whether to turn on sequence parallelism in Shardformer. Defaults to False.
         sequence_parallelism_mode (str): The Sequence parallelism mode. Can only be choosed from ["split_gather", "ring", "all_to_all"]. Defaults to "split_gather".
-        enable_sequence_overlap (bool): Whether to turn on sequence overlap in Shardformer. Defaults to False.
         parallel_output (bool): Whether to keep the output parallel when enabling tensor parallelism. Default to True.
         num_microbatches (int, optional): Number of microbatches when using pipeline parallelism. Defaults to None.
         microbatch_size (int, optional): Microbatch size when using pipeline parallelism.
@@ -189,7 +188,6 @@ def __init__(
         enable_jit_fused: bool = False,
         enable_sequence_parallelism: bool = False,
         sequence_parallelism_mode: str = None,
-        enable_sequence_overlap: bool = False,
         parallel_output: bool = True,
         num_microbatches: Optional[int] = None,
         microbatch_size: Optional[int] = None,
@@ -351,7 +349,6 @@ def __init__(
             enable_jit_fused=self.enable_jit_fused,
             enable_sequence_parallelism=enable_sequence_parallelism,
             sequence_parallelism_mode=sequence_parallelism_mode,
-            enable_sequence_overlap=enable_sequence_overlap,
             parallel_output=parallel_output,
             make_vocab_size_divisible_by=make_vocab_size_divisible_by,
             gradient_checkpoint_config=gradient_checkpoint_config,
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index 1d7a1f1042cc..5499443b65a5 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -102,7 +102,7 @@ def backward(ctx, grad_output):
             grad_output = grad_output.view(-1, grad_output.shape[-1])
             total_input = total_input.view(-1, total_input.shape[-1])
 
-        if ctx.async_grad_allreduce and fp8_communication:
+        if fp8_communication or not ctx.async_grad_allreduce:
             _reduce(grad_input, group=ctx.process_group, fp8_communication=fp8_communication, fp8_format="e5m2")
         elif ctx.async_grad_allreduce:
             # Asynchronous all-reduce
@@ -216,10 +216,12 @@ def switch_step():
         for k in recv_tensors:
             send_tensors[k], recv_tensors[k] = recv_tensors[k], send_tensors[k]
 
+    input_tensors = []
     output_tensors = []
 
     handles = communicate_step()
     # first round: special case, retrive from local tensor
+    input_tensors.append(input_to_gather)
     output_tensors.append(func(**input_to_gather, **input_local))
     for i in range(group_size - 2):
         for handle in handles:
@@ -230,14 +232,25 @@ def switch_step():
         handles = communicate_step()
 
         # actual computation
+        input_tensors.append(send_tensors)
         output_tensors.append(func(**send_tensors, **input_local))
 
     # final round: special case, no need to send/recv again
     for handle in handles:
         handle.wait()
+    input_tensors.append(send_tensors)
     output_tensors.append(func(**recv_tensors, **input_local))
 
-    return torch.cat(output_tensors[group_size - cur_rank :] + output_tensors[: group_size - cur_rank], dim=gather_dim)
+    gathered_input = {}
+    for k in input_to_gather:
+        input_shards = [d[k] for d in input_tensors[group_size - cur_rank :] + input_tensors[: group_size - cur_rank]]
+        gathered_input[k] = torch.cat(input_shards, dim=gather_dim)
+
+    gathered_output = torch.cat(
+        output_tensors[group_size - cur_rank :] + output_tensors[: group_size - cur_rank], dim=gather_dim
+    )
+
+    return gathered_output, gathered_input
 
 
 class _GatherForwardReduceScatterBackward(torch.autograd.Function):
@@ -293,29 +306,30 @@ class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap=True, ring=False):
+    def forward(ctx, input_, weight, bias, process_group, async_grad_reduce_scatter, dim, ring=False):
         ctx.save_for_backward(input_, weight, bias)
         ctx.use_bias = bias is not None
         ctx.process_group = process_group
         ctx.async_grad_reduce_scatter = async_grad_reduce_scatter
         ctx.dim = dim
-        ctx.overlap = overlap
 
         if ring is True:
             input_to_gather = {"input": input_}
             input_local = {"weight": weight}
 
-            output = _ring_as_gather(
+            output, input_dict = _ring_as_gather(
                 F.linear,
                 input_to_gather=input_to_gather,
                 input_local=input_local,
                 process_group=process_group,
             )
+            ctx.gathered_input = input_dict["input"]
 
             if bias is not None:
                 output += bias
         else:
             input_parallel = _gather(input_, dim, process_group)
+            ctx.gathered_input = input_parallel
             if bias is not None:
                 output = F.linear(input_parallel, weight, bias)
             else:
@@ -329,100 +343,50 @@ def backward(ctx, grad_output):
         use_bias = ctx.use_bias
         dim = ctx.dim
         process_group = ctx.process_group
-        overlap = ctx.overlap
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias. Used in FusedLayerNorm
         if use_bias:
             bias = bias.view(bias.shape)
 
-        if not overlap:
-            input_parallel = _gather(input_, dim, process_group)
-
-            total_input = input_parallel
-            grad_input = grad_output.matmul(weight)
-            grad_output = grad_output.contiguous()
-            # Convert the tensor shapes to 2D for execution compatibility
-            if len(grad_output.shape) > 2:
-                grad_output = grad_output.view(-1, grad_output.shape[-1])
-                total_input = total_input.view(-1, total_input.shape[-1])
-
-            if ctx.async_grad_reduce_scatter:
-                # Asynchronous reduce-scatter
-                input_list = [
-                    item.contiguous() for item in torch.chunk(grad_input, dist.get_world_size(process_group), dim=dim)
-                ]
-                output = torch.empty(
-                    input_.shape, dtype=input_parallel.dtype, device=input_parallel.device
-                ).contiguous()
-                handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True)
-                # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
-                # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py
-
-            if _grad_accum_fusion_available and weight.grad is not None:
-                grad = weight.grad
-                if grad.dtype == torch.float32:
-                    fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
-                    grad_weight = None
-                elif grad.dtype == torch.float16:
-                    fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad)
-                    grad_weight = None
-                else:
-                    grad_weight = grad_output.t().matmul(total_input)
-            else:
-                grad_weight = grad_output.t().matmul(total_input)
-
-            grad_bias = grad_output.sum(dim=0) if use_bias else None
+        input_parallel = ctx.gathered_input
 
-            if ctx.async_grad_reduce_scatter:
-                handle.wait()
+        total_input = input_parallel
+        grad_input = grad_output.matmul(weight)
+        grad_output = grad_output.contiguous()
+        # Convert the tensor shapes to 2D for execution compatibility
+        if len(grad_output.shape) > 2:
+            grad_output = grad_output.view(-1, grad_output.shape[-1])
+            total_input = total_input.view(-1, total_input.shape[-1])
 
-        else:
-            input_ = input_.contiguous()
-            world_size = dist.get_world_size(process_group)
-            tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
-
-            # do all gather in is async way
-            gather_handle = dist.all_gather(tensor_list, input_, group=process_group, async_op=True)
-            # calculate gradient and prepare data asynchronously with all-gather
-            # calculate
-            grad_input = grad_output.matmul(weight)
-            grad_output = grad_output.contiguous()
-            # Convert the tensor shapes to 2D for execution compatibility
-            if len(grad_output.shape) > 2:
-                grad_output = grad_output.view(-1, grad_output.shape[-1])
-            grad_bias = grad_output.sum(dim=0) if use_bias else None
-            # prepare data
+        if ctx.async_grad_reduce_scatter:
+            # Asynchronous reduce-scatter
             input_list = [
                 item.contiguous() for item in torch.chunk(grad_input, dist.get_world_size(process_group), dim=dim)
             ]
-            output = torch.empty(input_.shape, dtype=input_.dtype, device=input_.device).contiguous()
-            # wait until all-gather finished
-            gather_handle.wait()
-
-            # do reduce-scatter in async way
-            reducescatter_handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True)
-            input_parallel = torch.cat(tensor_list, dim=dim).contiguous()
-            # calculate gradient
-            if len(input_parallel.shape) > 2:
-                input_parallel = input_parallel.view(-1, input_parallel.shape[-1])
-
-            if _grad_accum_fusion_available and weight.grad is not None:
-                grad = weight.grad
-                if grad.dtype == torch.float32:
-                    fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(input_parallel, grad_output, grad)
-                    grad_weight = None
-                elif grad.dtype == torch.float16:
-                    fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(input_parallel, grad_output, grad)
-                    grad_weight = None
-                else:
-                    grad_weight = grad_output.t().matmul(input_parallel)
+            output = torch.empty(input_.shape, dtype=input_parallel.dtype, device=input_parallel.device).contiguous()
+            handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True)
+            # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
+            # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py
+
+        if _grad_accum_fusion_available and weight.grad is not None:
+            grad = weight.grad
+            if grad.dtype == torch.float32:
+                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
+                grad_weight = None
+            elif grad.dtype == torch.float16:
+                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad)
+                grad_weight = None
             else:
-                grad_weight = grad_output.t().matmul(input_parallel)
-            # grad_weight = grad_output.t().matmul(input_parallel)
-            # wait until reduce-scatter finished
-            reducescatter_handle.wait()
+                grad_weight = grad_output.t().matmul(total_input)
+        else:
+            grad_weight = grad_output.t().matmul(total_input)
 
-        return output, grad_weight, grad_bias, None, None, None, None, None
+        grad_bias = grad_output.sum(dim=0) if use_bias else None
+
+        if ctx.async_grad_reduce_scatter:
+            handle.wait()
+
+        return output, grad_weight, grad_bias, None, None, None, None
 
 
 def _ring_as_reducescatter(
@@ -553,7 +517,7 @@ def backward(ctx, grad_output):
         # Convert the tensor shapes to 2D for execution compatibility
         if len(grad_output.shape) > 2:
             grad_output = grad_output.view(-1, grad_output.shape[-1])
-            total_input = total_input.view(-1, total_input.shape[-1])
+            total_input = total_input.reshape(-1, total_input.shape[-1])
         grad_weight = grad_output.t().matmul(total_input)
         grad_bias = grad_output.sum(dim=0) if use_bias else None
 
@@ -611,34 +575,30 @@ class _MatmulWithGatherForwardReduceScatterBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(
-        ctx, input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring, fp8_communication
-    ):
+    def forward(ctx, input_, weight, bias, process_group, async_grad_reduce_scatter, dim, ring, fp8_communication):
         ctx.save_for_backward(input_, weight, bias)
         ctx.use_bias = bias is not None
         ctx.process_group = process_group
         ctx.async_grad_reduce_scatter = async_grad_reduce_scatter
         ctx.dim = dim
-        ctx.overlap = overlap
         ctx.fp8_communication = fp8_communication
 
         if ring is True:
-            input_to_gather = {}
-            input_local = {}
-            input_to_gather["input"] = input_
-            input_local["other"] = weight
+            input_to_gather = {"input": input_}
+            input_local = {"other": weight}
 
-            output = _ring_as_gather(
+            output, input_dict = _ring_as_gather(
                 torch.matmul,
                 input_to_gather=input_to_gather,
                 input_local=input_local,
                 process_group=process_group,
                 gather_dim=dim,
             )
+            ctx.gathered_input = input_dict["input"]
 
         else:
             input_parallel = _gather(input_, dim, process_group, fp8_communication, fp8_format="e4m3")
-
+            ctx.gathered_input = input_parallel
             output = torch.matmul(input_parallel, weight)
 
         if bias is not None:
@@ -651,76 +611,39 @@ def backward(ctx, grad_output):
         use_bias = ctx.use_bias
         dim = ctx.dim
         process_group = ctx.process_group
-        overlap = ctx.overlap
-        fp8_communication = ctx.fp8_communication
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to weight and bias. Used in FusedLayerNorm
         weight = weight.view(weight.shape)
         if use_bias:
             bias = bias.view(bias.shape)
 
-        if not overlap:
-            input_parallel = _gather(input_, dim, process_group, fp8_communication, fp8_format="e5m2")
-
-            total_input = input_parallel
-            grad_input = grad_output.matmul(weight.T)
-            grad_output = grad_output.contiguous()
-            # Convert the tensor shapes to 2D for execution compatibility
-            if len(grad_output.shape) > 2:
-                grad_output = grad_output.view(-1, grad_output.shape[-1])
-                total_input = total_input.view(-1, total_input.shape[-1])
-
-            if ctx.async_grad_reduce_scatter:
-                # Asynchronous reduce-scatter
-                input_list = [
-                    item.contiguous() for item in torch.chunk(grad_input, dist.get_world_size(process_group), dim=dim)
-                ]
-                output = torch.empty(
-                    input_.shape, dtype=input_parallel.dtype, device=input_parallel.device
-                ).contiguous()
-                handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True)
-                # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
-                # all-reduce scheduled first and have GPU resources allocated
-
-            grad_weight = total_input.t().matmul(grad_output)
-            grad_bias = grad_output.sum(dim=0) if use_bias else None
-
-            if ctx.async_grad_reduce_scatter:
-                handle.wait()
+        input_parallel = ctx.gathered_input
 
-        else:
-            world_size = dist.get_world_size(process_group)
-            tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
-
-            # do all gather in is async way
-            gather_handle = dist.all_gather(tensor_list, input_, group=process_group, async_op=True)
-            # calculate gradient and prepare data asynchronously with all-gather
-            # calculate
-            grad_input = grad_output.matmul(weight.T)
-            grad_output = grad_output.contiguous()
-            # Convert the tensor shapes to 2D for execution compatibility
-            if len(grad_output.shape) > 2:
-                grad_output = grad_output.view(-1, grad_output.shape[-1])
-            grad_bias = grad_output.sum(dim=0) if use_bias else None
-            # prepare data
+        total_input = input_parallel
+        grad_input = grad_output.matmul(weight.T)
+        grad_output = grad_output.contiguous()
+        # Convert the tensor shapes to 2D for execution compatibility
+        if len(grad_output.shape) > 2:
+            grad_output = grad_output.view(-1, grad_output.shape[-1])
+            total_input = total_input.view(-1, total_input.shape[-1])
+
+        if ctx.async_grad_reduce_scatter:
+            # Asynchronous reduce-scatter
             input_list = [
                 item.contiguous() for item in torch.chunk(grad_input, dist.get_world_size(process_group), dim=dim)
             ]
-            output = torch.empty(input_.shape, dtype=input_.dtype, device=input_.device).contiguous()
-            # wait until all-gather finished
-            gather_handle.wait()
+            output = torch.empty(input_.shape, dtype=input_parallel.dtype, device=input_parallel.device).contiguous()
+            handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True)
+            # Rely on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
+            # all-reduce scheduled first and have GPU resources allocated
+
+        grad_weight = total_input.t().matmul(grad_output)
+        grad_bias = grad_output.sum(dim=0) if use_bias else None
 
-            # do reduce-scatter in async way
-            reducescatter_handle = dist.reduce_scatter(output, input_list, group=process_group, async_op=True)
-            input_parallel = torch.cat(tensor_list, dim=dim).contiguous()
-            # calculate gradient
-            if len(input_parallel.shape) > 2:
-                input_parallel = input_parallel.view(-1, input_parallel.shape[-1])
-            grad_weight = input_parallel.t().matmul(grad_output)
-            # wait until reduce-scatter finished
-            reducescatter_handle.wait()
+        if ctx.async_grad_reduce_scatter:
+            handle.wait()
 
-        return output, grad_weight, grad_bias, None, None, None, None, None, None
+        return output, grad_weight, grad_bias, None, None, None, None, None
 
 
 class _SplitForwardGatherBackward(torch.autograd.Function):
@@ -1050,10 +973,10 @@ def linear_with_async_comm(input_, weight, bias, process_group, async_grad_allre
 
 
 def linear_gather_forward_reducescatter_backward(
-    input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring=False
+    input_, weight, bias, process_group, async_grad_reduce_scatter, dim, ring=False
 ):
     return _LinearWithGatherForwardReduceScatterBackward.apply(
-        input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring
+        input_, weight, bias, process_group, async_grad_reduce_scatter, dim, ring
     )
 
 
@@ -1070,10 +993,10 @@ def linear_reducescatter_forward_gather_backward(input_, weight, bias=None, proc
 
 
 def matmul_gather_forward_reducescatter_backward(
-    input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring=False, fp8_communication=False
+    input_, weight, bias, process_group, async_grad_reduce_scatter, dim, ring=False, fp8_communication=False
 ):
     return _MatmulWithGatherForwardReduceScatterBackward.apply(
-        input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring, fp8_communication
+        input_, weight, bias, process_group, async_grad_reduce_scatter, dim, ring, fp8_communication
     )
 
 
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index 52b0e79c64e0..0ba23c296798 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -23,17 +23,15 @@
 )
 
 from ._operation import (
-    gather_forward_reducescatter_backward,
     gather_forward_split_backward,
     linear_gather_forward_reducescatter_backward,
     linear_reducescatter_forward_gather_backward,
     linear_with_async_comm,
     reduce_forward,
-    reducescatter_forward_gather_backward,
     split_forward_gather_backward,
 )
 from .parallel_module import PaddingParallelModule, ParallelModule
-from .utils import create_randomizer_with_offset
+from .utils import create_randomizer_with_offset, is_share_sp_tp
 
 __all__ = ["Linear1D_Col", "Linear1D_Row"]
 
@@ -55,7 +53,6 @@ class Linear1D_Col(ParallelModule):
                     to all GPUs, otherwise, every GPU will have its output
                     which is :math:`Y_i = XA_i`, defaults to False
         seq_parallel (`bool`): If set to ``True``, it will use sequence parallel, defaults to False.
-        overlap (`bool`): If set to ``True``, it will overlap input all-gather with gradient computation during backward, defaults to False.
         skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
             which is preserved for kernel fusion, defaults to False
         weight_initializer (`typing.Callable`):
@@ -78,7 +75,6 @@ def __init__(
         gather_output: bool = False,
         seq_parallel_mode: str = None,
         seq_parallel_dim: int = 1,
-        overlap: torch.cuda.Stream = None,
         skip_bias_add: bool = False,
         weight: Optional[Parameter] = None,
         bias_: Optional[Parameter] = None,
@@ -95,7 +91,6 @@ def __init__(
         self.gather_output = gather_output
         self.seq_parallel_mode = seq_parallel_mode
         self.seq_parallel_dim = seq_parallel_dim
-        self.overlap = overlap
         self.skip_bias_add = skip_bias_add
         self.device = device
         self.process_group = process_group
@@ -202,16 +197,15 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
         # Matrix multiply.
         bias = self.bias if not self.skip_bias_add else None
 
-        if self.seq_parallel_mode == "split_gather":
-            input_parallel = gather_forward_reducescatter_backward(
-                input_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
-            )
-            output_parallel = linear_with_async_comm(
-                input_parallel, self.weight, bias, self.process_group, False, fp8_communication=self.fp8_communication
-            )
-        elif self.seq_parallel_mode == "ring":
+        if is_share_sp_tp(self.seq_parallel_mode):
             output_parallel = linear_gather_forward_reducescatter_backward(
-                input_parallel, self.weight, bias, self.process_group, True, self.seq_parallel_dim, self.overlap, True
+                input_parallel,
+                self.weight,
+                bias,
+                self.process_group,
+                True,
+                self.seq_parallel_dim,
+                ring=self.seq_parallel_mode == "ring",
             )
         else:
             output_parallel = linear_with_async_comm(
@@ -428,18 +422,13 @@ def forward(self, input_: Tensor) -> Tensor:
                     handle.wait()
                 output = torch.cat(output_parallel_list, dim=-1)
         else:
-            if self.seq_parallel_mode == "split_gather":
-                output_parallel = F.linear(input_, self.weight)
-                output = reducescatter_forward_gather_backward(
-                    output_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
-                )
-            elif self.seq_parallel_mode == "ring":
+            if is_share_sp_tp(self.seq_parallel_mode):
                 output = linear_reducescatter_forward_gather_backward(
                     input_,
                     self.weight,
                     process_group=self.process_group,
                     dim=self.seq_parallel_dim,
-                    ring=True,
+                    ring=self.seq_parallel_mode == "ring",
                 )
             else:
                 output_parallel = F.linear(input_, self.weight)
@@ -551,7 +540,6 @@ class VocabParallelLMHead1D(Linear1D_Col, PaddingParallelModule):
                     to all GPUs, otherwise, every GPU will have its output
                     which is :math:`Y_i = XA_i`, defaults to False
         seq_parallel (`bool`): If set to ``True``, it will use sequence parallel, defaults to False.
-        overlap (`bool`): If set to ``True``, it will overlap input all-gather with gradient computation during backward, defaults to False.
         skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
             which is preserved for kernel fusion, defaults to False
         weight_initializer (`typing.Callable`):
diff --git a/colossalai/shardformer/layer/qkv_fused_linear.py b/colossalai/shardformer/layer/qkv_fused_linear.py
index a1e25ff3a33c..6e469686b403 100644
--- a/colossalai/shardformer/layer/qkv_fused_linear.py
+++ b/colossalai/shardformer/layer/qkv_fused_linear.py
@@ -25,19 +25,17 @@
 )
 
 from ._operation import (
-    gather_forward_reducescatter_backward,
     linear_gather_forward_reducescatter_backward,
     linear_reducescatter_forward_gather_backward,
     linear_with_async_comm,
     matmul_gather_forward_reducescatter_backward,
     matmul_with_async_comm,
-    reduce_backward,
     reduce_forward,
     reducescatter_forward_gather_backward,
     split_forward_gather_backward,
 )
 from .parallel_module import ParallelModule
-from .utils import create_randomizer_with_offset
+from .utils import create_randomizer_with_offset, is_share_sp_tp
 
 __all__ = ["FusedLinear1D_Col", "FusedLinear1D_Row", "GPT2FusedLinearConv1D_Col", "GPT2FusedLinearConv1D_Row"]
 
@@ -222,10 +220,8 @@ def __init__(
         dtype: torch.dtype = None,
         device: torch.device = None,
         process_group: ProcessGroup = None,
-        async_communication: bool = False,
         gather_output: bool = False,
         seq_parallel_mode: str = None,
-        overlap: bool = False,
         skip_bias_add: bool = False,
         weight: Optional[Parameter] = None,
         bias_: Optional[Parameter] = None,
@@ -240,12 +236,10 @@ def __init__(
         self.out_features = out_features
         self.gather_output = gather_output
         self.seq_parallel_mode = seq_parallel_mode
-        self.overlap = overlap
         self.skip_bias_add = skip_bias_add
         self.device = device
         self.split_sizes = split_sizes
         self.process_group = process_group
-        self.async_communication = async_communication
         self.fp8_communication = fp8_communication
 
         assert (
@@ -370,7 +364,7 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
 
         # Matrix multiply.
         bias = self.bias if not self.skip_bias_add else None
-        if self.seq_parallel_mode == "split_gather":
+        if is_share_sp_tp(self.seq_parallel_mode):
             input_parallel = input_
             output_parallel = matmul_gather_forward_reducescatter_backward(
                 input_parallel,
@@ -379,31 +373,18 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
                 self.process_group,
                 True,
                 1,
-                self.overlap,
-                fp8_communication=self.fp8_communication,
-            )
-        elif self.seq_parallel_mode == "ring":
-            input_parallel = input_
-            output_parallel = matmul_gather_forward_reducescatter_backward(
-                input_parallel,
-                self.weight,
-                bias,
-                self.process_group,
-                True,
-                1,
-                self.overlap,
-                True,
+                ring=self.seq_parallel_mode == "ring",
                 fp8_communication=self.fp8_communication,
             )
         elif self.seq_parallel_mode is None or self.seq_parallel_mode == "ring_attn":
             # Set up backprop all-reduce.
-            input_parallel = reduce_backward(input_, self.process_group)
+            input_parallel = input_
             output_parallel = matmul_with_async_comm(
                 input_parallel,
                 self.weight,
                 bias,
                 self.process_group,
-                self.async_communication,
+                True,
                 fp8_communication=self.fp8_communication,
             )
         else:
@@ -620,7 +601,7 @@ def forward(self, input_: Tensor) -> Tensor:
             if self.seq_parallel_mode is None or self.seq_parallel_mode == "ring_attn":
                 output_parallel = torch.matmul(input_, self.weight)
                 output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
-            elif self.seq_parallel_mode == "split_gather":
+            elif is_share_sp_tp(self.seq_parallel_mode):
                 output_parallel = torch.matmul(input_, self.weight)
                 output = reducescatter_forward_gather_backward(
                     output_parallel,
@@ -628,13 +609,6 @@ def forward(self, input_: Tensor) -> Tensor:
                     1,
                     self.fp8_communication,
                 )
-            elif self.seq_parallel_mode == "ring":
-                output_parallel = torch.matmul(input_, self.weight)
-                output = reducescatter_forward_gather_backward(
-                    output_parallel,
-                    self.process_group,
-                    1,
-                )
             else:
                 raise NotImplementedError(f"seq_parallel_mode={self.seq_parallel_mode} is not supported!")
 
@@ -691,7 +665,6 @@ def __init__(
         gather_output: bool = False,
         seq_parallel_mode: str = None,
         seq_parallel_dim: int = 1,
-        overlap: torch.cuda.Stream = None,
         skip_bias_add: bool = False,
         weight: Optional[Parameter] = None,
         bias_: Optional[Parameter] = None,
@@ -706,7 +679,6 @@ def __init__(
         self.gather_output = gather_output
         self.seq_parallel_mode = seq_parallel_mode
         self.seq_parallel_dim = seq_parallel_dim
-        self.overlap = overlap
         self.skip_bias_add = skip_bias_add
         self.device = device
         self.split_sizes = split_sizes
@@ -830,16 +802,15 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
         # Matrix multiply.
         bias = self.bias if not self.skip_bias_add else None
 
-        if self.seq_parallel_mode == "split_gather":
-            input_parallel = gather_forward_reducescatter_backward(
-                input_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
-            )
-            output_parallel = linear_with_async_comm(
-                input_parallel, self.weight, bias, self.process_group, False, fp8_communication=self.fp8_communication
-            )
-        elif self.seq_parallel_mode == "ring":
+        if is_share_sp_tp(self.seq_parallel_mode):
             output_parallel = linear_gather_forward_reducescatter_backward(
-                input_parallel, self.weight, bias, self.process_group, True, self.seq_parallel_dim, self.overlap, True
+                input_parallel,
+                self.weight,
+                bias,
+                self.process_group,
+                True,
+                self.seq_parallel_dim,
+                ring=self.seq_parallel_mode == "ring",
             )
         else:
             output_parallel = linear_with_async_comm(
@@ -1031,18 +1002,13 @@ def forward(self, input_: Tensor) -> Tensor:
             )
             input_ = split_forward_gather_backward_fused_qkv(input_, self.split_sizes, self.process_group)
 
-        if self.seq_parallel_mode == "split_gather":
-            output_parallel = F.linear(input_, self.weight)
-            output = reducescatter_forward_gather_backward(
-                output_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
-            )
-        elif self.seq_parallel_mode == "ring":
+        if is_share_sp_tp(self.seq_parallel_mode):
             output = linear_reducescatter_forward_gather_backward(
                 input_,
                 self.weight,
                 process_group=self.process_group,
                 dim=self.seq_parallel_dim,
-                ring=True,
+                ring=self.seq_parallel_mode == "ring",
             )
         else:
             output_parallel = F.linear(input_, self.weight)
diff --git a/colossalai/shardformer/policies/bert.py b/colossalai/shardformer/policies/bert.py
index 4c33e14bc2ab..09673d3967b6 100644
--- a/colossalai/shardformer/policies/bert.py
+++ b/colossalai/shardformer/policies/bert.py
@@ -73,7 +73,6 @@ def module_policy(self):
             )
             sp_mode = "split_gather"
 
-        overlap = self.shard_config.enable_sequence_overlap
         sp_partial_derived = sp_mode == "split_gather"
 
         if self.shard_config.enable_tensor_parallelism:
@@ -97,7 +96,6 @@ def module_policy(self):
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
                             "seq_parallel_mode": sp_mode,
-                            "overlap": overlap,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
@@ -106,7 +104,6 @@ def module_policy(self):
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
                             "seq_parallel_mode": sp_mode,
-                            "overlap": overlap,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
@@ -115,7 +112,6 @@ def module_policy(self):
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
                             "seq_parallel_mode": sp_mode,
-                            "overlap": overlap,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
@@ -140,7 +136,6 @@ def module_policy(self):
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
                             "seq_parallel_mode": sp_mode,
-                            "overlap": overlap,
                             "skip_bias_add": self.enable_bias_gelu_fused,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
diff --git a/colossalai/shardformer/policies/bloom.py b/colossalai/shardformer/policies/bloom.py
index a43ac02d0cd7..7c6259e850c2 100644
--- a/colossalai/shardformer/policies/bloom.py
+++ b/colossalai/shardformer/policies/bloom.py
@@ -57,7 +57,6 @@ def module_policy(self):
             )
             sp_mode = "split_gather"
 
-        overlap = self.shard_config.enable_sequence_overlap
         sp_partial_derived = sp_mode == "split_gather"
 
         if self.shard_config.enable_tensor_parallelism:
@@ -78,7 +77,6 @@ def module_policy(self):
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
                             "seq_parallel_mode": sp_mode,
-                            "overlap": overlap,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
@@ -99,7 +97,6 @@ def module_policy(self):
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
                             "seq_parallel_mode": sp_mode,
-                            "overlap": overlap,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
diff --git a/colossalai/shardformer/policies/chatglm2.py b/colossalai/shardformer/policies/chatglm2.py
index 1b7d2db85991..c003570a0582 100644
--- a/colossalai/shardformer/policies/chatglm2.py
+++ b/colossalai/shardformer/policies/chatglm2.py
@@ -67,7 +67,6 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 f"For ChatGLM2, sequence parallelism doesn't support mode {sp_mode} yet, will set to be split_gather"
             )
             sp_mode = "split_gather"
-        overlap = self.shard_config.enable_sequence_overlap
         sp_partial_derived = sp_mode in ["split_gather"]
 
         if sp_mode == "all_to_all":
@@ -127,7 +126,6 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         kwargs={
                             "seq_parallel_mode": sp_mode,
                             "seq_parallel_dim": 0,
-                            "overlap": overlap,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
diff --git a/colossalai/shardformer/policies/gpt2.py b/colossalai/shardformer/policies/gpt2.py
index faacf91b265a..08accaaea279 100644
--- a/colossalai/shardformer/policies/gpt2.py
+++ b/colossalai/shardformer/policies/gpt2.py
@@ -65,7 +65,6 @@ def module_policy(self):
                 f"For GPT2, sequence parallelism is currently not support mode {sp_mode}, will set to be split_gather"
             )
             self.shard_config.sequence_parallelism_mode = sp_mode = "split_gather"
-        overlap = self.shard_config.enable_sequence_overlap
         sp_partial_derived = sp_mode in ["split_gather", "ring"]
         use_flash_attention = self.shard_config.enable_flash_attention
         if self.shard_config.enable_tensor_parallelism:
@@ -94,7 +93,6 @@ def module_policy(self):
                         kwargs={
                             "split_sizes": [self.model.config.hidden_size] * 3,
                             "seq_parallel_mode": sp_mode,
-                            "overlap": overlap,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
@@ -109,7 +107,6 @@ def module_policy(self):
                         kwargs={
                             "split_sizes": [self.model.config.n_inner or 4 * self.model.config.hidden_size],
                             "seq_parallel_mode": sp_mode,
-                            "overlap": overlap,
                             "skip_bias_add": self.enable_bias_gelu_fused,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
diff --git a/colossalai/shardformer/policies/gptj.py b/colossalai/shardformer/policies/gptj.py
index 6f0c8803c3f1..9fcca1385f79 100644
--- a/colossalai/shardformer/policies/gptj.py
+++ b/colossalai/shardformer/policies/gptj.py
@@ -51,7 +51,6 @@ def module_policy(self):
             self.shard_config.enable_sequence_parallelism = False
             warnings.warn("GPTJ doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
 
-        overlap = self.shard_config.enable_sequence_overlap
         if self.shard_config.enable_tensor_parallelism:
             assert (
                 self.model.config.num_attention_heads % self.shard_config.tensor_parallel_size == 0
@@ -76,7 +75,6 @@ def module_policy(self):
                         suffix="attn.k_proj",
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
-                            "overlap": overlap,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
@@ -84,7 +82,6 @@ def module_policy(self):
                         suffix="attn.q_proj",
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
-                            "overlap": overlap,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
@@ -92,7 +89,6 @@ def module_policy(self):
                         suffix="attn.v_proj",
                         target_module=col_nn.Linear1D_Col,
                         kwargs={
-                            "overlap": overlap,
                             "fp8_communication": self.shard_config.fp8_communication,
                         },
                     ),
diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py
index 1219119bb095..911226e5c749 100644
--- a/colossalai/shardformer/shard/shard_config.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -26,7 +26,6 @@ class ShardConfig:
         enable_flash_attention (bool, optional): Whether to switch on flash attention. Defaults to False.
         enable_jit_fused (bool, optional): Whether to switch on JIT fused operators. Defaults to False.
         enable_sequence_parallelism (bool): Whether to turn on sequence parallelism, which partitions non-tensor-parallel regions along the sequence dimension. Defaults to False.
-        enable_sequence_overlap (bool): Whether to turn on sequence overlap, which overlap the computation and communication in sequence parallelism. It can only be used when enable_sequence_parallelism is True. Defaults to False.
         gradient_checkpoint_config (Optional[GradientCheckpointConfig]): The gradient checkpoint config. Defaults to None.
         enable_all_optimization (bool): Whether to turn on all optimization tools including 'fused normalization', 'flash attention', 'JIT fused operators', 'sequence parallelism' and 'sequence overlap'. Defaults to False.
         fp8_communication (bool, optional): Whether to enable fp8 communication in model parallelism. Defaults to False.
@@ -44,7 +43,6 @@ class ShardConfig:
     enable_jit_fused: bool = False
     enable_sequence_parallelism: bool = False
     sequence_parallelism_mode: str = None
-    enable_sequence_overlap: bool = False
     parallel_output: bool = True
     make_vocab_size_divisible_by: int = 64
     gradient_checkpoint_config: Optional[GradientCheckpointConfig] = None
@@ -84,24 +82,12 @@ def __post_init__(self):
                 assert (
                     self.enable_tensor_parallelism
                 ), f"sequence parallelism mode {self.sequence_parallelism_mode} can only be used when enable_tensor_parallelism is True"
-            elif self.sequence_parallelism_mode in ["all_to_all"]:
-                # assert (
-                #     not self.enable_tensor_parallelism
-                # ), f"sequence parallelism mode {self.sequence_parallelism_mode} can only be used when enable_tensor_parallelism is False"
-                if self.enable_sequence_overlap:
-                    self.enable_sequence_overlap = False
-                    warnings.warn(
-                        f"The enable_sequence_overlap flag will be ignored in sequence parallelism mode {self.sequence_parallelism_mode}"
-                    )
         else:
             if self.sequence_parallelism_mode:
                 self.sequence_parallelism_mode = None
                 warnings.warn(
                     f"The sequence_parallelism_mode will be ignored when enable_sequence_parallelism is False"
                 )
-            assert (
-                not self.enable_sequence_overlap
-            ), f"enable_sequence_overlap can only be set to True when enable_sequence_parallelism is True"
 
         # get the tensor parallel size
         if not self.enable_tensor_parallelism:
@@ -134,4 +120,3 @@ def _turn_on_all_optimization(self):
         # This can cause non-in-place param sharding when used without ZeRO.
         # It may also slow down training when seq len is small. Plz enable manually.
         # self.enable_sequence_parallelism = True
-        # self.enable_sequence_overlap = True
diff --git a/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py b/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
index 923075e0e83d..a45beb77108f 100644
--- a/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
+++ b/tests/test_shardformer/test_layer/test_gpt2_qkv_fused_linear_1d.py
@@ -41,7 +41,7 @@ def forward(self, x):
         return x
 
 
-def check_linear_conv_1d_col(lazy_init: bool, seq_parallel_mode: str, overlap: bool):
+def check_linear_conv_1d_col(lazy_init: bool, seq_parallel_mode: str):
     ctx = LazyInitContext() if lazy_init else nullcontext()
     linear = Conv1D(192, 48).cuda()
     with ctx:
@@ -52,7 +52,6 @@ def check_linear_conv_1d_col(lazy_init: bool, seq_parallel_mode: str, overlap: b
         gather_output=True,
         seq_parallel_mode=seq_parallel_mode,
         split_sizes=[64] * 3,
-        overlap=overlap,
     )
 
     assert linear.weight.shape == torch.Size([48, 192])
@@ -121,9 +120,8 @@ def check_linear_conv_1d_row(lazy_init: bool, seq_parallel_mode: bool):
 
 @parameterize("lazy_init", [False, True])
 @parameterize("seq_parallel_mode", ["split_gather", None])
-@parameterize("overlap", [True])
-def check_gpt2_qkv_fused_linear_1d(lazy_init: bool, seq_parallel_mode: bool, overlap: bool):
-    check_linear_conv_1d_col(lazy_init, seq_parallel_mode, overlap)
+def check_gpt2_qkv_fused_linear_1d(lazy_init: bool, seq_parallel_mode: bool):
+    check_linear_conv_1d_col(lazy_init, seq_parallel_mode)
     check_linear_conv_1d_row(lazy_init, seq_parallel_mode)
 
 

From 0002ae5956bfbab15810ce95ea8db70865fb80f1 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 11 Oct 2024 14:16:21 +0800
Subject: [PATCH 114/167] fix

---
 colossalai/shardformer/layer/attn.py | 37 +++++++++++++++++-----------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 7c25bee1a3a2..7af8271eb734 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -476,24 +476,31 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
         rank = dist.get_rank()
 
         num_ring_size = world_size // num_rings
-        num_inner_group = num_ring_size // inner_ring_size
 
         if tp_size > 1:
+            ranks = []
             for i in range(num_rings):
-                for j in range(num_inner_group):
-                    # find inner ring group in one sp groups
-                    start = j + i * num_ring_size
-                    ranks = list(range(start, start + tp_size * inner_ring_size, tp_size))
-                    group = dist.new_group(ranks)
-                    if rank in ranks:
-                        inner_ring_group = group
-            for i in range(num_rings):
-                for j in range(num_inner_group):
-                    start = j + (i * num_inner_group)
-                    ranks = list(range(start, start + num_ring_size + 1, num_ring_size))
-                    group = dist.new_group(ranks)
-                    if rank in ranks:
-                        inter_ring_group = group
+                start = i * num_ring_size
+                end = (i + 1) * num_ring_size
+                for idx in range(start, end):
+                    inner_rank = []
+                    for k in range(inner_ring_size):
+                        current_num = idx + k * tp_size
+                        if current_num >= end:
+                            break
+                        inner_rank.append(current_num)
+                    if len(inner_rank) == inner_ring_size and inner_rank not in ranks:
+                        ranks.append(inner_rank)
+                        group = dist.new_group(inner_rank)
+                        if rank in inner_rank:
+                            inner_ring_group = group
+
+            for i in range(num_ring_size):
+                inter_rank = [i + j * num_ring_size for j in range(num_rings)]
+                group = dist.new_group(inter_rank)
+                if rank in inter_rank:
+                    inter_ring_group = group
+
         else:
             # Create inner ring groups
             for i in range(inner_ring_size):

From 1507a7528fd86fba5a86427ff7e1283709f863d0 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 11 Oct 2024 06:20:34 +0000
Subject: [PATCH 115/167] fix

---
 colossalai/shardformer/layer/attn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 7af8271eb734..411b4dbcc83a 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -478,6 +478,7 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
         num_ring_size = world_size // num_rings
 
         if tp_size > 1:
+            # Create inner ring groups
             ranks = []
             for i in range(num_rings):
                 start = i * num_ring_size
@@ -494,7 +495,7 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
                         group = dist.new_group(inner_rank)
                         if rank in inner_rank:
                             inner_ring_group = group
-
+            # Create inter ring groups
             for i in range(num_ring_size):
                 inter_rank = [i + j * num_ring_size for j in range(num_rings)]
                 group = dist.new_group(inter_rank)

From 0ca16d5cbea6d482f0fcedc62d0db8592f41fe9d Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Fri, 11 Oct 2024 07:32:43 +0000
Subject: [PATCH 116/167] [fix] fix llama, mixtral benchmark zbv loss none bug;
 update mixtral & llama policy and modeling;

---
 .../pipeline/schedule/zero_bubble_pp.py       |  19 +-
 colossalai/shardformer/modeling/mixtral.py    | 488 +++---------------
 colossalai/shardformer/policies/mixtral.py    |  16 +-
 examples/language/llama/benchmark.py          |  38 +-
 examples/language/mixtral/benchmark.py        |   9 +-
 5 files changed, 137 insertions(+), 433 deletions(-)

diff --git a/colossalai/pipeline/schedule/zero_bubble_pp.py b/colossalai/pipeline/schedule/zero_bubble_pp.py
index 5c25c5bfaa80..c928a207c405 100644
--- a/colossalai/pipeline/schedule/zero_bubble_pp.py
+++ b/colossalai/pipeline/schedule/zero_bubble_pp.py
@@ -432,7 +432,6 @@ def forward_step(
             internal_inputs = {} if input_obj is None else input_obj
             internal_inputs["stage_index"] = self.stage_manager.stage_indices[model_chunk_id]
             output_obj = model_forward(model_chunk, micro_batch, internal_inputs)
-
             # last layer in model
             if model_chunk_id == 1 and self.stage_manager.is_first_stage(ignore_chunk=True):
                 loss = criterion(output_obj, micro_batch) / self.num_microbatch
@@ -500,12 +499,18 @@ def backward_b_step(
         output_obj_ = [v for v in output_obj_ if isinstance(v, torch.Tensor) or v is None]
         output_obj_grad_ = [v for v in output_obj_grad_ if isinstance(v, torch.Tensor) or v is None]
 
-        optimizer.backward_by_grad(
-            tensor=output_obj_,
-            grad=output_obj_grad_,
-            inputs=input_obj_,
-            retain_graph=True,
-        )
+        try:
+            ctx = optimizer.no_sync()
+        except AttributeError:
+            ctx = model_chunk.no_sync()
+
+        with ctx:
+            optimizer.backward_by_grad(
+                tensor=output_obj_,
+                grad=output_obj_grad_,
+                inputs=input_obj_,
+                retain_graph=True,
+            )
 
         # Format output_obj_grad
         input_obj_grad = {}
diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index 3709af54c486..a783b5c5eb26 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -267,98 +267,25 @@ def mixtral_model_forward(
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        if stage_manager.is_interleave:
-            if stage_manager.use_zbv:
-                # zbv
-                if stage_manager.is_first_stage(ignore_chunk=True) and stage_manager.model_chunk_id == 0:
-                    # retrieve input_ids and inputs_embeds
-                    if input_ids is not None and inputs_embeds is not None:
-                        raise ValueError(
-                            "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
-                        )
-                    elif input_ids is not None:
-                        batch_size, seq_length = input_ids.shape
-                    elif inputs_embeds is not None:
-                        batch_size, seq_length, _ = inputs_embeds.shape
-                    else:
-                        raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-                    device = input_ids.device if input_ids is not None else inputs_embeds.device
-                    if inputs_embeds is None:
-                        inputs_embeds = self.embed_tokens(input_ids)
-                    hidden_states = inputs_embeds
-                else:
-                    input_shape = hidden_states.shape[:-1]
-                    batch_size, seq_length = input_shape
-                    device = hidden_states.device
+        # retrieve input_ids and inputs_embeds
+        if stage_manager.is_first_stage():
+            # retrieve input_ids and inputs_embeds
+            if input_ids is not None and inputs_embeds is not None:
+                raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+            elif input_ids is not None:
+                batch_size, seq_length = input_ids.shape
+            elif inputs_embeds is not None:
+                batch_size, seq_length, _ = inputs_embeds.shape
             else:
-                # interleaved
-                if stage_manager.is_first_stage(ignore_chunk=True):
-                    # retrieve input_ids and inputs_embeds
-                    if input_ids is not None and inputs_embeds is not None:
-                        raise ValueError(
-                            "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
-                        )
-                    elif input_ids is not None:
-                        batch_size, seq_length = input_ids.shape
-                    elif inputs_embeds is not None:
-                        batch_size, seq_length, _ = inputs_embeds.shape
-                    else:
-                        raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-                    device = input_ids.device if input_ids is not None else inputs_embeds.device
-                    if inputs_embeds is None:
-                        inputs_embeds = self.embed_tokens(input_ids)
-                    hidden_states = inputs_embeds
-                else:
-                    input_shape = hidden_states.shape[:-1]
-                    batch_size, seq_length = input_shape
-                    device = hidden_states.device
+                raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            if inputs_embeds is None:
+                inputs_embeds = self.embed_tokens(input_ids)
+            hidden_states = inputs_embeds
         else:
-            # 1f1b or None
-            if stage_manager.is_first_stage():  # No ignore_chunk=True for 1f1b
-                # retrieve input_ids and inputs_embeds
-                if input_ids is not None and inputs_embeds is not None:
-                    raise ValueError(
-                        "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
-                    )
-                elif input_ids is not None:
-                    batch_size, seq_length = input_ids.shape
-                elif inputs_embeds is not None:
-                    batch_size, seq_length, _ = inputs_embeds.shape
-                else:
-                    raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-                device = input_ids.device if input_ids is not None else inputs_embeds.device
-                if inputs_embeds is None:
-                    inputs_embeds = self.embed_tokens(input_ids)
-                hidden_states = inputs_embeds
-            else:
-                input_shape = hidden_states.shape[:-1]
-                batch_size, seq_length = input_shape
-                device = hidden_states.device
-
-        #######
-        # Attention, we support consider 1f1b, interleaved, zbv
-        #######
-
-        # # retrieve input_ids and inputs_embeds
-        # print(f"model_chunk_id {stage_manager.model_chunk_id} stage_manager {stage_manager.stage}")
-        # if stage_manager.is_first_stage():
-        #     # retrieve input_ids and inputs_embeds
-        #     if input_ids is not None and inputs_embeds is not None:
-        #         raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
-        #     elif input_ids is not None:
-        #         batch_size, seq_length = input_ids.shape
-        #     elif inputs_embeds is not None:
-        #         batch_size, seq_length, _ = inputs_embeds.shape
-        #     else:
-        #         raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-        #     device = input_ids.device if input_ids is not None else inputs_embeds.device
-        #     if inputs_embeds is None:
-        #         inputs_embeds = self.embed_tokens(input_ids)
-        #     hidden_states = inputs_embeds
-        # else:
-        #     input_shape = hidden_states.shape[:-1]
-        #     batch_size, seq_length = input_shape
-        #     device = hidden_states.device
+            input_shape = hidden_states.shape[:-1]
+            batch_size, seq_length = input_shape
+            device = hidden_states.device
 
         seq_length_with_past = seq_length
         past_key_values_length = 0
@@ -462,22 +389,8 @@ def custom_forward(*inputs):
             if output_router_logits:
                 all_router_logits += (layer_outputs[-1],)
 
-        #######
-        # Attention, we support consider 1f1b, interleaved, zbv
-        #######
-        if stage_manager.is_interleave:
-            if stage_manager.use_zbv:
-                if stage_manager.is_first_stage(ignore_chunk=True) and stage_manager.model_chunk_id == 1:
-                    hidden_states = self.norm(hidden_states)
-            else:
-                if stage_manager.is_last_stage(ignore_chunk=True):
-                    hidden_states = self.norm(hidden_states)
-        else:
-            if stage_manager.is_last_stage():  # No ignore_chunk=True for 1f1b
-                hidden_states = self.norm(hidden_states)
-
-        # if stage_manager.is_last_stage():
-        #     hidden_states = self.norm(hidden_states)
+        if stage_manager.is_last_stage():
+            hidden_states = self.norm(hidden_states)
 
         # add hidden states from the last decoder layer
         if output_hidden_states:
@@ -487,113 +400,30 @@ def custom_forward(*inputs):
         if output_router_logits and past_router_logits is not None:
             all_router_logits = past_router_logits + all_router_logits
 
-        #######
-        # Attention, we support consider 1f1b, interleaved, zbv
-        #######
-        if stage_manager.is_interleave:
-            if stage_manager.use_zbv:
-                # zbv
-                if stage_manager.is_first_stage(ignore_chunk=True) and stage_manager.model_chunk_id == 1:
-                    if not return_dict:
-                        return tuple(
-                            v
-                            for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
-                            if v is not None
-                        )
-                    return MoeModelOutputWithPast(
-                        last_hidden_state=hidden_states,
-                        past_key_values=next_cache,
-                        hidden_states=all_hidden_states,
-                        attentions=all_self_attns,
-                        router_logits=all_router_logits,
-                    )
-                else:
-                    if output_router_logits:
-                        return {
-                            "hidden_states": hidden_states,
-                            "past_router_logits": all_router_logits,
-                        }
-                    else:
-                        return {
-                            "hidden_states": hidden_states,
-                        }
-            else:
-                # interlearved
-                if stage_manager.is_last_stage(ignore_chunk=True):
-                    if not return_dict:
-                        return tuple(
-                            v
-                            for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
-                            if v is not None
-                        )
-                    return MoeModelOutputWithPast(
-                        last_hidden_state=hidden_states,
-                        past_key_values=next_cache,
-                        hidden_states=all_hidden_states,
-                        attentions=all_self_attns,
-                        router_logits=all_router_logits,
-                    )
-                else:
-                    if output_router_logits:
-                        return {
-                            "hidden_states": hidden_states,
-                            "past_router_logits": all_router_logits,
-                        }
-                    else:
-                        return {
-                            "hidden_states": hidden_states,
-                        }
-        else:
-            # 1f1b or other
-            if stage_manager.is_last_stage():
-                if not return_dict:
-                    return tuple(
-                        v
-                        for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
-                        if v is not None
-                    )
-                return MoeModelOutputWithPast(
-                    last_hidden_state=hidden_states,
-                    past_key_values=next_cache,
-                    hidden_states=all_hidden_states,
-                    attentions=all_self_attns,
-                    router_logits=all_router_logits,
+        if stage_manager.is_last_stage():
+            if not return_dict:
+                return tuple(
+                    v
+                    for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
+                    if v is not None
                 )
+            return MoeModelOutputWithPast(
+                last_hidden_state=hidden_states,
+                past_key_values=next_cache,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attns,
+                router_logits=all_router_logits,
+            )
+        else:
+            if output_router_logits:
+                return {
+                    "hidden_states": hidden_states,
+                    "past_router_logits": all_router_logits,
+                }
             else:
-                if output_router_logits:
-                    return {
-                        "hidden_states": hidden_states,
-                        "past_router_logits": all_router_logits,
-                    }
-                else:
-                    return {
-                        "hidden_states": hidden_states,
-                    }
-
-        # if stage_manager.is_last_stage():
-        #     if not return_dict:
-        #         return tuple(
-        #             v
-        #             for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
-        #             if v is not None
-        #         )
-        #     return MoeModelOutputWithPast(
-        #         last_hidden_state=hidden_states,
-        #         past_key_values=next_cache,
-        #         hidden_states=all_hidden_states,
-        #         attentions=all_self_attns,
-        #         router_logits=all_router_logits,
-        #     )
-        # else:
-        #     if output_router_logits:
-        #         return {
-        #             "hidden_states": hidden_states,
-        #             "past_router_logits": all_router_logits,
-        #         }
-        #     else:
-        #         return {
-        #             "hidden_states": hidden_states,
-        #         }
+                return {
+                    "hidden_states": hidden_states,
+                }
 
     @staticmethod
     def mixtral_for_causal_lm_forward(
@@ -679,201 +509,51 @@ def mixtral_for_causal_lm_forward(
         )
         past_key_values = None
 
-        #######
-        # Attention, we support consider 1f1b, interleaved, zbv
-        #######
-        if stage_manager.is_interleave:
-            if stage_manager.use_zbv:
-                # zbv
-                if stage_manager.is_first_stage(ignore_chunk=True) and stage_manager.model_chunk_id == 1:
-                    hidden_states = outputs[0]
-                    logits = self.lm_head(hidden_states)
-                    logits = logits.float()
-
-                    loss = None
-                    if labels is not None:
-                        # Shift so that tokens < n predict n
-                        shift_logits = logits[..., :-1, :].contiguous()
-                        shift_labels = labels[..., 1:].contiguous()
-                        # Flatten the tokens
-                        loss_fct = CrossEntropyLoss()
-                        shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                        shift_labels = shift_labels.view(-1)
-                        # Enable model parallelism
-                        shift_labels = shift_labels.to(shift_logits.device)
-                        loss = loss_fct(shift_logits, shift_labels)
-
-                    aux_loss = None
-                    if output_router_logits:
-                        aux_loss = load_balancing_loss_func(outputs[-1], self.num_experts, self.num_experts_per_tok)
-                        if labels is not None:
-                            loss += self.router_aux_loss_coef * aux_loss
-
-                    if not return_dict:
-                        output = (logits,) + outputs[1:]
-                        if output_router_logits:
-                            output = (aux_loss,) + output
-                        return (loss,) + output if loss is not None else output
-
-                    return MoeCausalLMOutputWithPast(
-                        loss=loss,
-                        aux_loss=aux_loss,
-                        logits=logits,
-                        past_key_values=None,
-                        hidden_states=outputs[0],
-                        attentions=None,
-                        router_logits=outputs[-1],
-                    )
-                else:
-                    out = {}
-                    hidden_states = outputs.get("hidden_states")
-                    out["hidden_states"] = hidden_states
-                    if output_router_logits:
-                        out["past_router_logits"] = outputs["past_router_logits"]
-                    return out
-            else:
-                # interleaved
-                if stage_manager.is_last_stage(ignore_chunk=True):
-                    hidden_states = outputs[0]
-                    logits = self.lm_head(hidden_states)
-                    logits = logits.float()
-
-                    loss = None
-                    if labels is not None:
-                        # Shift so that tokens < n predict n
-                        shift_logits = logits[..., :-1, :].contiguous()
-                        shift_labels = labels[..., 1:].contiguous()
-                        # Flatten the tokens
-                        loss_fct = CrossEntropyLoss()
-                        shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                        shift_labels = shift_labels.view(-1)
-                        # Enable model parallelism
-                        shift_labels = shift_labels.to(shift_logits.device)
-                        loss = loss_fct(shift_logits, shift_labels)
-
-                    aux_loss = None
-                    if output_router_logits:
-                        aux_loss = load_balancing_loss_func(outputs[-1], self.num_experts, self.num_experts_per_tok)
-                        if labels is not None:
-                            loss += self.router_aux_loss_coef * aux_loss
-
-                    if not return_dict:
-                        output = (logits,) + outputs[1:]
-                        if output_router_logits:
-                            output = (aux_loss,) + output
-                        return (loss,) + output if loss is not None else output
-
-                    return MoeCausalLMOutputWithPast(
-                        loss=loss,
-                        aux_loss=aux_loss,
-                        logits=logits,
-                        past_key_values=None,
-                        hidden_states=outputs[0],
-                        attentions=None,
-                        router_logits=outputs[-1],
-                    )
-                else:
-                    out = {}
-                    hidden_states = outputs.get("hidden_states")
-                    out["hidden_states"] = hidden_states
-                    if output_router_logits:
-                        out["past_router_logits"] = outputs["past_router_logits"]
-                    return out
-        else:
-            # 1f1b or otherwise
-            if stage_manager.is_last_stage():
-                hidden_states = outputs[0]
-                logits = self.lm_head(hidden_states)
-                logits = logits.float()
-
-                loss = None
+        if stage_manager.is_last_stage():
+            hidden_states = outputs[0]
+            logits = self.lm_head(hidden_states)
+            logits = logits.float()
+            loss = None
+            if labels is not None:
+                # Shift so that tokens < n predict n
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+                # Flatten the tokens
+                loss_fct = CrossEntropyLoss()
+                shift_logits = shift_logits.view(-1, self.config.vocab_size)
+                shift_labels = shift_labels.view(-1)
+                # Enable model parallelism
+                shift_labels = shift_labels.to(shift_logits.device)
+                loss = loss_fct(shift_logits, shift_labels)
+
+            aux_loss = None
+            if output_router_logits:
+                aux_loss = load_balancing_loss_func(outputs[-1], self.num_experts, self.num_experts_per_tok)
                 if labels is not None:
-                    # Shift so that tokens < n predict n
-                    shift_logits = logits[..., :-1, :].contiguous()
-                    shift_labels = labels[..., 1:].contiguous()
-                    # Flatten the tokens
-                    loss_fct = CrossEntropyLoss()
-                    shift_logits = shift_logits.view(-1, self.config.vocab_size)
-                    shift_labels = shift_labels.view(-1)
-                    # Enable model parallelism
-                    shift_labels = shift_labels.to(shift_logits.device)
-                    loss = loss_fct(shift_logits, shift_labels)
-
-                aux_loss = None
-                if output_router_logits:
-                    aux_loss = load_balancing_loss_func(outputs[-1], self.num_experts, self.num_experts_per_tok)
-                    if labels is not None:
-                        loss += self.router_aux_loss_coef * aux_loss
-
-                if not return_dict:
-                    output = (logits,) + outputs[1:]
-                    if output_router_logits:
-                        output = (aux_loss,) + output
-                    return (loss,) + output if loss is not None else output
-
-                return MoeCausalLMOutputWithPast(
-                    loss=loss,
-                    aux_loss=aux_loss,
-                    logits=logits,
-                    past_key_values=None,
-                    hidden_states=outputs[0],
-                    attentions=None,
-                    router_logits=outputs[-1],
-                )
-            else:
-                out = {}
-                hidden_states = outputs.get("hidden_states")
-                out["hidden_states"] = hidden_states
+                    loss += self.router_aux_loss_coef * aux_loss
+
+            if not return_dict:
+                output = (logits,) + outputs[1:]
                 if output_router_logits:
-                    out["past_router_logits"] = outputs["past_router_logits"]
-                return out
-
-        # if stage_manager.is_last_stage():
-        #     hidden_states = outputs[0]
-        #     logits = self.lm_head(hidden_states)
-        #     logits = logits.float()
-
-        #     loss = None
-        #     if labels is not None:
-        #         # Shift so that tokens < n predict n
-        #         shift_logits = logits[..., :-1, :].contiguous()
-        #         shift_labels = labels[..., 1:].contiguous()
-        #         # Flatten the tokens
-        #         loss_fct = CrossEntropyLoss()
-        #         shift_logits = shift_logits.view(-1, self.config.vocab_size)
-        #         shift_labels = shift_labels.view(-1)
-        #         # Enable model parallelism
-        #         shift_labels = shift_labels.to(shift_logits.device)
-        #         loss = loss_fct(shift_logits, shift_labels)
-
-        #     aux_loss = None
-        #     if output_router_logits:
-        #         aux_loss = load_balancing_loss_func(outputs[-1], self.num_experts, self.num_experts_per_tok)
-        #         if labels is not None:
-        #             loss += self.router_aux_loss_coef * aux_loss
-
-        #     if not return_dict:
-        #         output = (logits,) + outputs[1:]
-        #         if output_router_logits:
-        #             output = (aux_loss,) + output
-        #         return (loss,) + output if loss is not None else output
-
-        #     return MoeCausalLMOutputWithPast(
-        #         loss=loss,
-        #         aux_loss=aux_loss,
-        #         logits=logits,
-        #         past_key_values=None,
-        #         hidden_states=outputs[0],
-        #         attentions=None,
-        #         router_logits=outputs[-1],
-        #     )
-        # else:
-        #     out = {}
-        #     hidden_states = outputs.get("hidden_states")
-        #     out["hidden_states"] = hidden_states
-        #     if output_router_logits:
-        #         out["past_router_logits"] = outputs["past_router_logits"]
-        #     return out
+                    output = (aux_loss,) + output
+                return (loss,) + output if loss is not None else output
+
+            return MoeCausalLMOutputWithPast(
+                loss=loss,
+                aux_loss=aux_loss,
+                logits=logits,
+                past_key_values=None,
+                hidden_states=outputs[0],
+                attentions=None,
+                router_logits=outputs[-1],
+            )
+        else:
+            out = {}
+            hidden_states = outputs.get("hidden_states")
+            out["hidden_states"] = hidden_states
+            if output_router_logits:
+                out["past_router_logits"] = outputs["past_router_logits"]
+            return out
 
 
 def get_mixtral_flash_attention_forward(shard_config, sp_mode=None, sp_size=None, sp_group=None):
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index a8cd49dc17de..9d8d2b54b32c 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -343,18 +343,10 @@ def get_held_layers(self) -> List[Module]:
         """Get pipeline layers for current stage."""
         stage_manager = self.pipeline_stage_manager
         held_layers = super().get_held_layers()
-        if stage_manager.is_interleave:
-            if stage_manager.use_zbv:
-                if stage_manager.is_first_stage(ignore_chunk=True):
-                    held_layers.append(self.model.lm_head)
-            else:
-                if stage_manager.is_last_stage(ignore_chunk=True):
-                    held_layers.append(self.model.lm_head)
-        else:
-            if stage_manager.is_last_stage():
-                held_layers.append(self.model.lm_head)
-        # if stage_manager.is_last_stage():
-        #     held_layers.append(self.model.lm_head)
+        if stage_manager.use_zbv and stage_manager.is_first_stage(ignore_chunk=True):
+            held_layers.append(self.model.lm_head)
+        elif stage_manager.is_last_stage(ignore_chunk=True):
+            held_layers.append(self.model.lm_head)
         return held_layers
 
     def get_shared_params(self) -> List[Dict[int, Tensor]]:
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 0e88fabf1eb0..0f418edb628b 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -21,6 +21,7 @@
 from colossalai.cluster import DistCoordinator
 from colossalai.lazy import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
+from colossalai.pipeline.schedule.v_schedule import PipelineGraph
 from colossalai.shardformer import PipelineGradientCheckpointConfig
 
 warnings.filterwarnings("ignore")
@@ -91,7 +92,7 @@ def main():
     parser.add_argument("--zero", type=int, default=0, help="Zero Stage when hybrid plugin is enabled")
     parser.add_argument("--custom-ckpt", action="store_true", help="Customize checkpoint", default=False)
 
-    parser.add_argument("--pp_style", default="1f1b", choices=["1f1b", "interleaved"])
+    parser.add_argument("--pp_style", default="1f1b", choices=["1f1b", "interleaved", "zbv"])
     parser.add_argument("--n_chunks", default=1, help="number of model chunks", type=eval)
     parser.add_argument("--profile", action="store_true", help="Profile the code")
     parser.add_argument(
@@ -137,6 +138,11 @@ def empty_init():
     # ==============================
     # Initialize Booster
     # ==============================
+    if args.config in MODEL_CONFIGS:
+        config = MODEL_CONFIGS[args.config]
+    else:
+        config = AutoConfig.from_pretrained(args.config, trust_remote_code=True)
+
     use_empty_init = True
     if args.plugin == "gemini":
         plugin = GeminiPlugin(
@@ -210,6 +216,23 @@ def empty_init():
                 fp8_communication=args.use_fp8_comm,
             )
     elif args.plugin == "3d":
+        if args.pp_style == "zbv":
+            mem_f = 34 * config.hidden_size + 5 * config.num_attention_heads * args.max_length
+            mem_w = -32 * config.hidden_size
+            mem_b = -mem_w - mem_f
+            scheduler_nodes = PipelineGraph(
+                n_stage=args.pp,
+                n_micro=args.batch_size // args.mbs,
+                f_cost=1000,
+                b_cost=1000,
+                w_cost=1000,
+                c_cost=1,
+                f_mem=mem_f,
+                b_mem=mem_b,
+                w_mem=mem_w,
+            ).get_v_schedule()
+        else:
+            scheduler_nodes = None
         plugin = HybridParallelPlugin(
             tp_size=args.tp,
             pp_size=args.pp,
@@ -227,6 +250,7 @@ def empty_init():
             overlap_allgather=args.overlap_allgather,
             use_fp8=args.use_fp8,
             fp8_communication=args.use_fp8_comm,
+            scheduler_nodes=scheduler_nodes,
             **hybrid_kwargs,
         )
     elif args.plugin == "3d_cpu":
@@ -256,10 +280,6 @@ def empty_init():
     # ==============================
     dp_size = getattr(plugin, "dp_size", coordinator.world_size)
 
-    if args.config in MODEL_CONFIGS:
-        config = MODEL_CONFIGS[args.config]
-    else:
-        config = AutoConfig.from_pretrained(args.config, trust_remote_code=True)
     torch.cuda.manual_seed(42)
     dataset = RandomDataset(
         num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
@@ -334,8 +354,12 @@ def empty_init():
                     return_loss=True,
                 )
                 loss = outputs["loss"]
-                if dist.get_rank() == dist.get_world_size() - 1:
-                    print(f"Step {step} loss: {loss}")
+                if args.pp_style == "zbv":
+                    if dist.get_rank() == 0:
+                        print(f"Step {step} loss: {loss}")
+                else:
+                    if dist.get_rank() == dist.get_world_size() - 1:
+                        print(f"Step {step} loss: {loss}")
                 optimizer.step()
                 optimizer.zero_grad()
 
diff --git a/examples/language/mixtral/benchmark.py b/examples/language/mixtral/benchmark.py
index 2685afcedd6a..0334bd81c2ea 100644
--- a/examples/language/mixtral/benchmark.py
+++ b/examples/language/mixtral/benchmark.py
@@ -227,7 +227,6 @@ def main():
     )
 
     optimizer = HybridAdam(model.parameters())
-    # optimizer = torch.optim.SGD(model.parameters(), lr=1)
     torch.set_default_dtype(torch.bfloat16)
     model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
 
@@ -258,8 +257,12 @@ def main():
                     return_loss=True,
                 )
                 loss = outputs["loss"]
-                if dist.get_rank() == dist.get_world_size() - 1:
-                    print(f"Step {step} loss: {loss}")
+                if args.pp_style == "zbv":
+                    if dist.get_rank() == 0:
+                        print(f"Step {step} loss: {loss}")
+                else:
+                    if dist.get_rank() == dist.get_world_size() - 1:
+                        print(f"Step {step} loss: {loss}")
                 optimizer.step()
                 optimizer.zero_grad()
 

From 4e0e99bb6a356cb734194aa985d837e3744b0b92 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 11 Oct 2024 17:31:40 +0800
Subject: [PATCH 117/167] fix the test

---
 .../test_layer/test_ring_attn.py               | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/tests/test_shardformer/test_layer/test_ring_attn.py b/tests/test_shardformer/test_layer/test_ring_attn.py
index 1c7647a7d560..38dfcb23903b 100644
--- a/tests/test_shardformer/test_layer/test_ring_attn.py
+++ b/tests/test_shardformer/test_layer/test_ring_attn.py
@@ -5,6 +5,7 @@
 from torch.testing import assert_close
 
 import colossalai
+from colossalai.cluster import ProcessGroupMesh
 from colossalai.shardformer.layer import AttnMaskType
 from colossalai.shardformer.layer.attn import AttnMaskType, RingAttention
 from colossalai.shardformer.layer.utils import split_batch_zigzag, split_varlen_zigzag
@@ -17,11 +18,15 @@
 @parameterize("nheads", [5])
 @parameterize("d", [128])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
-def check_ring_attn(seq_len, bs, nheads, d, dtype):
+def check_ring_attn(seq_len, bs, nheads, d, dtype, sp_size, tp_size=1):
     torch.cuda.manual_seed(2)
     device = get_current_device()
-    sp_group = dist.group.WORLD
-    sp_size = dist.get_world_size()
+    sp_axis, tp_axis = 0, 1
+    pg_mesh = ProcessGroupMesh(sp_size, tp_size)
+    tp_group = pg_mesh.get_group_along_axis(tp_axis)
+    sp_group = pg_mesh.get_group_along_axis(sp_axis)
+    # sp_group = dist.group.WORLD
+    # sp_size = dist.get_world_size()
     # Some outliers may seem large, but our errors are still lower than
     # than Megatron-LM context parallel's
     # (https://github.com/NVIDIA/TransformerEngine/blob/33a3d02f81c56e6f7b542c09bfa86657078d57fb/tests/pytorch/fused_attn/run_fused_attn_with_cp.py#L215)
@@ -44,6 +49,7 @@ def check_ring_attn(seq_len, bs, nheads, d, dtype):
         AttnMaskType.CAUSAL,
         return_softmax=True,
         inner_ring_size=max(2, sp_size // 2),
+        tp_group=tp_group,
         # inner_ring_size=4
     )
     ring_out = ring_out.transpose(1, 2)
@@ -161,12 +167,12 @@ def check_packed_seq(seqlen, bs, nheads, d, dtype):
 def launch_single_ring(rank, world_size, port):
     colossalai.launch(rank, world_size, "localhost", port)
     check_packed_seq()
-    check_ring_attn()
+    check_ring_attn(sp_size=world_size)
 
 
 def launch_double_ring(rank, world_size, port):
     colossalai.launch(rank, world_size, "localhost", port)
-    check_ring_attn()
+    check_ring_attn(sp_size=4, tp_size=2)
 
 
 @rerun_if_address_is_in_use()
@@ -176,7 +182,7 @@ def test_ring_attn(world_size):
 
 
 @rerun_if_address_is_in_use()
-@parameterize("world_size", [4])
+@parameterize("world_size", [8])
 def test_double_ring(world_size):
     spawn(launch_double_ring, nprocs=world_size)
 

From 703bb5c18dd3a701209017de828e9cdc8ca58356 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Fri, 11 Oct 2024 17:34:20 +0800
Subject: [PATCH 118/167] fix the test

---
 tests/test_shardformer/test_layer/test_ring_attn.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/test_shardformer/test_layer/test_ring_attn.py b/tests/test_shardformer/test_layer/test_ring_attn.py
index 38dfcb23903b..b9291a06199a 100644
--- a/tests/test_shardformer/test_layer/test_ring_attn.py
+++ b/tests/test_shardformer/test_layer/test_ring_attn.py
@@ -25,8 +25,6 @@ def check_ring_attn(seq_len, bs, nheads, d, dtype, sp_size, tp_size=1):
     pg_mesh = ProcessGroupMesh(sp_size, tp_size)
     tp_group = pg_mesh.get_group_along_axis(tp_axis)
     sp_group = pg_mesh.get_group_along_axis(sp_axis)
-    # sp_group = dist.group.WORLD
-    # sp_size = dist.get_world_size()
     # Some outliers may seem large, but our errors are still lower than
     # than Megatron-LM context parallel's
     # (https://github.com/NVIDIA/TransformerEngine/blob/33a3d02f81c56e6f7b542c09bfa86657078d57fb/tests/pytorch/fused_attn/run_fused_attn_with_cp.py#L215)
@@ -50,7 +48,6 @@ def check_ring_attn(seq_len, bs, nheads, d, dtype, sp_size, tp_size=1):
         return_softmax=True,
         inner_ring_size=max(2, sp_size // 2),
         tp_group=tp_group,
-        # inner_ring_size=4
     )
     ring_out = ring_out.transpose(1, 2)
     out, lse, _ = flash_attn_qkvpacked_func(

From 4c8e85ee0d1a030e6e09aa499eb27076a7b12e84 Mon Sep 17 00:00:00 2001
From: Tong Li <tong.li352711588@gmail.com>
Date: Fri, 11 Oct 2024 19:32:00 +0800
Subject: [PATCH 119/167] [Coati] Train DPO using PP (#6054)

* update dpo

* remove unsupport plugin

* update msg

* update dpo

* remove unsupport plugin

* update msg

* update template

* update dataset

* add pp for dpo

* update dpo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add dpo fn

* update dpo

* update dpo

* update dpo

* update dpo

* minor update

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update loss

* update help

* polish code

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../ColossalChat/coati/models/loss.py         |   5 +-
 .../ColossalChat/coati/models/utils.py        |   4 +-
 .../ColossalChat/coati/trainer/dpo.py         | 658 +++++++++++++-----
 .../prepare_dataset.py                        |   3 +-
 .../examples/training_scripts/train_dpo.py    |  75 +-
 .../examples/training_scripts/train_sft.py    |   2 +-
 .../ColossalChat/tests/test_templating.sh     |  21 +-
 colossalai/shardformer/modeling/llama.py      |   1 +
 8 files changed, 531 insertions(+), 238 deletions(-)

diff --git a/applications/ColossalChat/coati/models/loss.py b/applications/ColossalChat/coati/models/loss.py
index bd0bbd36b9bc..927dfd5a89b6 100755
--- a/applications/ColossalChat/coati/models/loss.py
+++ b/applications/ColossalChat/coati/models/loss.py
@@ -153,10 +153,11 @@ def forward(
         else:
             # If no reference model is provided
             ref_logratios = 0.0
+
         pi_logratios = logprob_actor_chosen.sum(-1) - logprob_actor_reject.sum(-1)
         logits = pi_logratios - ref_logratios - self.gamma / self.beta
         losses = -torch.nn.functional.logsigmoid(self.beta * logits)
-
+        loss = losses.mean()
         # Calculate rewards for logging
         if logprob_ref_chosen is not None:
             chosen_rewards = self.beta * (logprob_actor_chosen.sum(-1) - logprob_ref_chosen.sum(-1)).detach()
@@ -167,7 +168,7 @@ def forward(
         else:
             rejected_rewards = self.beta * logprob_actor_reject.sum(-1).detach()
 
-        return losses, chosen_rewards, rejected_rewards
+        return loss, chosen_rewards, rejected_rewards
 
 
 class LogSigLoss(nn.Module):
diff --git a/applications/ColossalChat/coati/models/utils.py b/applications/ColossalChat/coati/models/utils.py
index c583f057a5ab..fe7ab209822f 100755
--- a/applications/ColossalChat/coati/models/utils.py
+++ b/applications/ColossalChat/coati/models/utils.py
@@ -50,8 +50,8 @@ def _log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.
         torch.Tensor: The log probabilities corresponding to the labels.
     """
     log_probs = F.log_softmax(logits, dim=-1)
-    log_probs_labels = log_probs.gather(dim=-1, index=labels.unsqueeze(-1))
-    return log_probs_labels.squeeze(-1)
+    per_label_logps = log_probs.gather(dim=-1, index=labels.unsqueeze(-1))
+    return per_label_logps.squeeze(-1)
 
 
 def calc_action_log_probs(logits: torch.Tensor, sequences: torch.LongTensor, num_actions: int) -> torch.Tensor:
diff --git a/applications/ColossalChat/coati/trainer/dpo.py b/applications/ColossalChat/coati/trainer/dpo.py
index faa7a90d92de..499113e9646f 100755
--- a/applications/ColossalChat/coati/trainer/dpo.py
+++ b/applications/ColossalChat/coati/trainer/dpo.py
@@ -6,6 +6,7 @@
 from typing import Any, Optional
 
 import torch
+import torch.distributed as dist
 from coati.models.loss import DpoLoss
 from coati.models.utils import calc_masked_log_probs
 from coati.trainer.utils import all_reduce_mean
@@ -13,10 +14,11 @@
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.data import DataLoader
-from tqdm import trange
+from tqdm import tqdm, trange
 from transformers import PreTrainedTokenizerBase
 
 from colossalai.booster import Booster, Plugin
+from colossalai.booster.plugin import HybridParallelPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.utils import get_current_device
 
@@ -96,18 +98,25 @@ def _before_fit(
         self.train_dataloader = train_preference_dataloader
         self.eval_dataloader = eval_preference_dataloader
         self.writer = None
-        if use_wandb and is_rank_0():
+
+        init_criterion = (
+            dist.get_rank() == dist.get_world_size() - 1
+            if isinstance(self.plugin, HybridParallelPlugin) and self.plugin.pp_size > 1
+            else is_rank_0()
+        )
+
+        if use_wandb and init_criterion:
             assert log_dir is not None, "log_dir must be provided when use_wandb is True"
             import wandb
 
             self.wandb_run = wandb.init(project="Coati-dpo", sync_tensorboard=True)
-        if log_dir is not None and is_rank_0():
+        if log_dir is not None and init_criterion:
             import os
             import time
 
             from torch.utils.tensorboard import SummaryWriter
 
-            log_dir = os.path.join(log_dir, "dpo")
+            log_dir = os.path.join(log_dir, "DPO")
             log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
             self.writer = SummaryWriter(log_dir=log_dir)
 
@@ -117,166 +126,147 @@ def _train(self, epoch: int):
             epoch int: the number of current epoch
         """
         self.model.train()
-        self.accumulative_meter.reset()
-        step_bar = trange(
-            len(self.train_dataloader) // self.accumulation_steps,
-            desc=f"Epoch {epoch + 1}/{self.max_epochs}",
-            disable=not is_rank_0(),
-        )
-        for i, batch in enumerate(self.train_dataloader):
-            batch = to_device(batch, self.device)
-            (
-                chosen_input_ids,
-                chosen_attention_mask,
-                chosen_loss_mask,
-                reject_input_ids,
-                reject_attention_mask,
-                reject_loss_mask,
-            ) = (
-                batch["chosen_input_ids"],
-                batch["chosen_attention_mask"],
-                batch["chosen_loss_mask"],
-                batch["reject_input_ids"],
-                batch["reject_attention_mask"],
-                batch["reject_loss_mask"],
-            )
-            if not self.apply_loss_mask:
-                chosen_loss_mask = chosen_loss_mask.fill_(1.0)
-                reject_loss_mask = reject_loss_mask.fill_(1.0)
-
-            batch_size = chosen_input_ids.size()[0]
-
-            actor_all_logits = self.model(
-                input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
-                attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
-            )["logits"]
-            actor_chosen_logits = actor_all_logits[:batch_size]
-            actor_reject_logits = actor_all_logits[batch_size:]
-            logprob_actor_chosen = calc_masked_log_probs(
-                actor_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
-            )
-
-            logprob_actor_reject = calc_masked_log_probs(
-                actor_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
+        if isinstance(self.plugin, HybridParallelPlugin) and self.plugin.pp_size > 1:
+            step_bar = tqdm(
+                range(len(self.train_dataloader)),
+                desc="Step",
+                disable=not (dist.get_rank() == dist.get_world_size() - 1),
             )
-
-            if self.ref_model is not None:
-                self.ref_model.eval()
-                with torch.no_grad():
-                    ref_all_logits = self.ref_model(
-                        input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
-                        attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
-                    )["logits"]
-                    ref_chosen_logits = ref_all_logits[:batch_size]
-                    ref_reject_logits = ref_all_logits[batch_size:]
-                    logprob_ref_chosen = calc_masked_log_probs(
-                        ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
-                    )
-                    logprob_ref_reject = calc_masked_log_probs(
-                        ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
+            for i, batch in enumerate(self.train_dataloader):
+                batch = to_device(batch, self.device)
+                (
+                    chosen_input_ids,
+                    chosen_attention_mask,
+                    chosen_loss_mask,
+                    reject_input_ids,
+                    reject_attention_mask,
+                    reject_loss_mask,
+                ) = (
+                    batch["chosen_input_ids"],
+                    batch["chosen_attention_mask"],
+                    batch["chosen_loss_mask"],
+                    batch["reject_input_ids"],
+                    batch["reject_attention_mask"],
+                    batch["reject_loss_mask"],
+                )
+                batch_size = chosen_input_ids.size()[0]
+                # Calculate logits from reference model.
+                if self.ref_model is not None:
+                    self.ref_model.eval()
+                    with torch.no_grad():
+                        ref_all_logits = self.ref_model(
+                            input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
+                            attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
+                        )["logits"]
+                        ref_chosen_logits = ref_all_logits[:batch_size]
+                        ref_reject_logits = ref_all_logits[batch_size:]
+                        logprob_ref_chosen = calc_masked_log_probs(
+                            ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
+                        )
+                        logprob_ref_reject = calc_masked_log_probs(
+                            ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
+                        )
+                else:
+                    logprob_ref_chosen = None
+                    logprob_ref_reject = None
+
+                # Merge chosen and reject
+                inputs_ids = torch.stack([item for tup in zip(chosen_input_ids, reject_input_ids) for item in tup])
+                attention_mask = torch.stack(
+                    [item for tup in zip(chosen_attention_mask, reject_attention_mask) for item in tup]
+                )
+                loss_mask = torch.stack([item for tup in zip(chosen_loss_mask, reject_loss_mask) for item in tup])
+                logprob_ref = torch.stack([item for tup in zip(logprob_ref_chosen, logprob_ref_reject) for item in tup])
+
+                data_iter = iter(
+                    [
+                        {
+                            "input_ids": inputs_ids,
+                            "attention_mask": attention_mask,
+                            "loss_mask": loss_mask,
+                            "logprob_ref": logprob_ref,
+                        }
+                    ]
+                )
+                rewards = []
+
+                def _criterion(outputs, inputs):
+                    loss, chosen_rewards, rejected_rewards = self.actor_loss_fn(
+                        calc_masked_log_probs(
+                            outputs["logits"][0::2],
+                            inputs["input_ids"][0::2],
+                            inputs["loss_mask"][0::2][:, 1:],
+                            self.length_normalization,
+                        ),
+                        calc_masked_log_probs(
+                            outputs["logits"][1::2],
+                            inputs["input_ids"][1::2],
+                            inputs["loss_mask"][1::2][:, 1:],
+                            self.length_normalization,
+                        ),
+                        inputs["logprob_ref"][0::2] if inputs["logprob_ref"] is not None else None,
+                        inputs["logprob_ref"][1::2] if inputs["logprob_ref"] is not None else None,
+                        inputs["loss_mask"][0::2][:, 1:],
+                        inputs["loss_mask"][1::2][:, 1:],
                     )
-            else:
-                logprob_ref_chosen = None
-                logprob_ref_reject = None
-
-            losses, chosen_rewards, rejected_rewards = self.actor_loss_fn(
-                logprob_actor_chosen,
-                logprob_actor_reject,
-                logprob_ref_chosen if logprob_ref_chosen is not None else None,
-                logprob_ref_reject if logprob_ref_reject is not None else None,
-                chosen_loss_mask[:, 1:],
-                reject_loss_mask[:, 1:],
-            )
-            reward_accuracies = (chosen_rewards > rejected_rewards).float().mean()
-
-            # DPO Loss
-            loss = losses.mean()
+                    rewards.append(chosen_rewards)
+                    rewards.append(rejected_rewards)
+                    return loss
+
+                outputs = self.booster.execute_pipeline(
+                    data_iter,
+                    self.model,
+                    criterion=_criterion,
+                    optimizer=self.optimizer,
+                    return_loss=True,
+                )
+                loss = outputs["loss"]
+                if self.booster.plugin.stage_manager.is_last_stage():
+                    chosen_rewards, rejected_rewards = rewards[0], rewards[1]
+                    global_loss = all_reduce_mean(loss, self.plugin)
+                    if dist.get_rank() == dist.get_world_size() - 1:
+                        step_bar.set_postfix(
+                            {
+                                "train/loss": global_loss.item(),
+                                "train/lr": self.actor_scheduler.get_last_lr()[0],
+                                "train/chosen_rewards": chosen_rewards.to(torch.float16).mean().item(),
+                                "train/rejected_rewards": rejected_rewards.to(torch.float16).mean().item(),
+                            }
+                        )
+                        step_bar.update()
+                        self.accumulative_meter.add("loss", global_loss.item())
+                        self.accumulative_meter.add("chosen_rewards", chosen_rewards.to(torch.float16).mean().item())
+                        self.accumulative_meter.add(
+                            "rejected_rewards", rejected_rewards.to(torch.float16).mean().item()
+                        )
+                        if self.writer is not None:
+                            self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), i)
+                            self.writer.add_scalar(
+                                "train/chosen_rewards", self.accumulative_meter.get("chosen_rewards"), i
+                            )
+                            self.writer.add_scalar(
+                                "train/rejected_rewards",
+                                self.accumulative_meter.get("rejected_rewards"),
+                                i,
+                            )
+                            self.writer.add_scalar(
+                                "train/margin",
+                                self.accumulative_meter.get("chosen_rewards")
+                                - self.accumulative_meter.get("rejected_rewards"),
+                                i,
+                            )
 
-            self.booster.backward(loss=loss, optimizer=self.optimizer)
-            if self.num_train_step % self.accumulation_steps == self.accumulation_steps - 1:
                 self.optimizer.step()
                 self.optimizer.zero_grad()
                 self.actor_scheduler.step()
-
-            # sync
-            loss_mean = all_reduce_mean(tensor=loss)
-            chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards)
-            rejected_rewards_mean = all_reduce_mean(tensor=rejected_rewards)
-            reward_accuracies_mean = all_reduce_mean(tensor=reward_accuracies)
-            self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
-            self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
-            self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
-            self.accumulative_meter.add("accuracy", reward_accuracies_mean.to(torch.float16).item())
-
-            if i % self.accumulation_steps == self.accumulation_steps - 1:
-                self.num_train_step += 1
-                step_bar.update()
-                # logging
-                if self.writer and is_rank_0():
-                    self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
-                    self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
-                    self.writer.add_scalar(
-                        "train/chosen_rewards", self.accumulative_meter.get("chosen_rewards"), self.num_train_step
-                    )
-                    self.writer.add_scalar(
-                        "train/rejected_rewards",
-                        self.accumulative_meter.get("rejected_rewards"),
-                        self.num_train_step,
-                    )
-                    self.writer.add_scalar(
-                        "train/margin",
-                        self.accumulative_meter.get("chosen_rewards") - self.accumulative_meter.get("rejected_rewards"),
-                        self.num_train_step,
-                    )
-                    self.writer.add_scalar(
-                        "train/accuracy",
-                        self.accumulative_meter.get("accuracy"),
-                        self.num_train_step,
-                    )
-                self.accumulative_meter.reset()
-
-                if self.save_dir is not None and (self.num_train_step + 1) % self.save_interval == 0:
-                    # save checkpoint
-                    self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
-                    save_checkpoint(
-                        save_dir=self.save_dir,
-                        booster=self.booster,
-                        model=self.model,
-                        optimizer=self.optimizer,
-                        lr_scheduler=self.actor_scheduler,
-                        epoch=epoch,
-                        step=i + 1,
-                        batch_size=batch_size,
-                        coordinator=self.coordinator,
-                    )
-                    self.coordinator.print_on_master(
-                        f"Saved checkpoint at epoch {epoch} step {self.save_interval} at folder {self.save_dir}"
-                    )
-
-        step_bar.close()
-
-    def _eval(self, epoch: int):
-        """
-        Args:
-            epoch int: the number of current epoch
-        """
-        if self.eval_dataloader is None:
-            self.coordinator.print_on_master("No eval dataloader is provided, skip evaluation")
-            return
-        self.model.eval()
-        self.ref_model.eval()
-        self.coordinator.print_on_master("\nStart evaluation...")
-
-        step_bar = trange(
-            len(self.eval_dataloader),
-            desc=f"Epoch {epoch + 1}/{self.max_epochs}",
-            disable=not is_rank_0(),
-        )
-
-        self.accumulative_meter.reset()
-
-        with torch.no_grad():
-            for i, batch in enumerate(self.eval_dataloader):
+        else:
+            self.accumulative_meter.reset()
+            step_bar = trange(
+                len(self.train_dataloader) // self.accumulation_steps,
+                desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+                disable=not is_rank_0(),
+            )
+            for i, batch in enumerate(self.train_dataloader):
                 batch = to_device(batch, self.device)
                 (
                     chosen_input_ids,
@@ -300,12 +290,11 @@ def _eval(self, epoch: int):
                 batch_size = chosen_input_ids.size()[0]
 
                 actor_all_logits = self.model(
-                    torch.cat([chosen_input_ids, reject_input_ids]),
-                    torch.cat([chosen_attention_mask, reject_attention_mask]),
+                    input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
+                    attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
                 )["logits"]
                 actor_chosen_logits = actor_all_logits[:batch_size]
                 actor_reject_logits = actor_all_logits[batch_size:]
-
                 logprob_actor_chosen = calc_masked_log_probs(
                     actor_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
                 )
@@ -314,22 +303,26 @@ def _eval(self, epoch: int):
                     actor_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
                 )
 
-                self.ref_model.eval()
-
-                ref_all_logits = self.ref_model(
-                    torch.cat([chosen_input_ids, reject_input_ids]),
-                    torch.cat([chosen_attention_mask, reject_attention_mask]),
-                )["logits"]
-                ref_chosen_logits = ref_all_logits[:batch_size]
-                ref_reject_logits = ref_all_logits[batch_size:]
-                logprob_ref_chosen = calc_masked_log_probs(
-                    ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
-                )
-                logprob_ref_reject = calc_masked_log_probs(
-                    ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
-                )
-
-                losses, chosen_rewards, rejected_rewards = self.actor_loss_fn(
+                if self.ref_model is not None:
+                    self.ref_model.eval()
+                    with torch.no_grad():
+                        ref_all_logits = self.ref_model(
+                            input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
+                            attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
+                        )["logits"]
+                        ref_chosen_logits = ref_all_logits[:batch_size]
+                        ref_reject_logits = ref_all_logits[batch_size:]
+                        logprob_ref_chosen = calc_masked_log_probs(
+                            ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
+                        )
+                        logprob_ref_reject = calc_masked_log_probs(
+                            ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
+                        )
+                else:
+                    logprob_ref_chosen = None
+                    logprob_ref_reject = None
+
+                loss, chosen_rewards, rejected_rewards = self.actor_loss_fn(
                     logprob_actor_chosen,
                     logprob_actor_reject,
                     logprob_ref_chosen if logprob_ref_chosen is not None else None,
@@ -338,7 +331,9 @@ def _eval(self, epoch: int):
                     reject_loss_mask[:, 1:],
                 )
                 reward_accuracies = (chosen_rewards > rejected_rewards).float().mean()
-                loss = losses.mean()
+
+                self.booster.backward(loss=loss, optimizer=self.optimizer)
+                # sync
                 loss_mean = all_reduce_mean(tensor=loss)
                 chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards)
                 rejected_rewards_mean = all_reduce_mean(tensor=rejected_rewards)
@@ -347,16 +342,301 @@ def _eval(self, epoch: int):
                 self.accumulative_meter.add("rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item())
                 self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
                 self.accumulative_meter.add("accuracy", reward_accuracies_mean.to(torch.float16).item())
-                self.accumulative_meter.add(
-                    "margin", (chosen_rewards_mean - rejected_rewards_mean).to(torch.float16).mean().item()
+
+                if (i + 1) % self.accumulation_steps == 0:
+                    self.optimizer.step()
+                    self.optimizer.zero_grad()
+                    self.actor_scheduler.step()
+
+                    step_bar.set_postfix(
+                        {
+                            "train/loss": self.accumulative_meter.get("loss"),
+                            "train/chosen_rewards": self.accumulative_meter.get("chosen_rewards"),
+                            "train/rejected_rewards": self.accumulative_meter.get("rejected_rewards"),
+                            "train/accuracy": self.accumulative_meter.get("accuracy"),
+                        }
+                    )
+                    step_bar.update()
+                    if self.writer and is_rank_0():
+                        self.writer.add_scalar("train/loss", self.accumulative_meter.get("loss"), self.num_train_step)
+                        self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
+                        self.writer.add_scalar(
+                            "train/chosen_rewards", self.accumulative_meter.get("chosen_rewards"), self.num_train_step
+                        )
+                        self.writer.add_scalar(
+                            "train/rejected_rewards",
+                            self.accumulative_meter.get("rejected_rewards"),
+                            self.num_train_step,
+                        )
+                        self.writer.add_scalar(
+                            "train/margin",
+                            self.accumulative_meter.get("chosen_rewards")
+                            - self.accumulative_meter.get("rejected_rewards"),
+                            self.num_train_step,
+                        )
+                        self.writer.add_scalar(
+                            "train/accuracy",
+                            self.accumulative_meter.get("accuracy"),
+                            self.num_train_step,
+                        )
+                    self.num_train_step += 1
+                    self.accumulative_meter.reset()
+
+            if self.save_dir is not None and self.num_train_step > 0 and self.num_train_step % self.save_interval == 0:
+                # save checkpoint
+                self.coordinator.print_on_master("\nStart saving model checkpoint with running states")
+                save_checkpoint(
+                    save_dir=self.save_dir,
+                    booster=self.booster,
+                    model=self.model,
+                    optimizer=self.optimizer,
+                    lr_scheduler=self.actor_scheduler,
+                    epoch=epoch,
+                    step=self.num_train_step,
+                    batch_size=batch_size,
+                    coordinator=self.coordinator,
                 )
-                step_bar.update()
-
-        msg = "Evaluation Result:\n"
-        for tag in ["loss", "chosen_rewards", "rejected_rewards", "accuracy", "margin"]:
-            msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n"
-        self.coordinator.print_on_master(msg)
-        os.makedirs(self.save_dir, exist_ok=True)
-        with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
-            f.write(msg)
+                self.coordinator.print_on_master(
+                    f"Saved checkpoint at epoch {epoch} step {self.save_interval} at folder {self.save_dir}"
+                )
+
+        step_bar.close()
+
+    def _eval(self, epoch: int):
+        """
+        Args:
+            epoch int: the number of current epoch
+        """
+        if self.eval_dataloader is None:
+            self.coordinator.print_on_master("No eval dataloader is provided, skip evaluation")
+            return
+        self.model.eval()
+        self.ref_model.eval()
+        self.accumulative_meter.reset()
+        self.coordinator.print_on_master("\nStart evaluation...")
+
+        if isinstance(self.plugin, HybridParallelPlugin) and self.plugin.pp_size > 1:
+            step_bar = tqdm(
+                range(len(self.eval_dataloader)),
+                desc="Step",
+                disable=not (dist.get_rank() == dist.get_world_size() - 1),
+            )
+            with torch.no_grad():
+                for _, batch in enumerate(self.eval_dataloader):
+                    batch = to_device(batch, self.device)
+                    (
+                        chosen_input_ids,
+                        chosen_attention_mask,
+                        chosen_loss_mask,
+                        reject_input_ids,
+                        reject_attention_mask,
+                        reject_loss_mask,
+                    ) = (
+                        batch["chosen_input_ids"],
+                        batch["chosen_attention_mask"],
+                        batch["chosen_loss_mask"],
+                        batch["reject_input_ids"],
+                        batch["reject_attention_mask"],
+                        batch["reject_loss_mask"],
+                    )
+                    batch_size = chosen_input_ids.size()[0]
+                    # Calculate logits from reference model.
+                    if self.ref_model is not None:
+                        self.ref_model.eval()
+                        with torch.no_grad():
+                            ref_all_logits = self.ref_model(
+                                input_ids=torch.cat([chosen_input_ids, reject_input_ids]),
+                                attention_mask=torch.cat([chosen_attention_mask, reject_attention_mask]),
+                            )["logits"]
+                            ref_chosen_logits = ref_all_logits[:batch_size]
+                            ref_reject_logits = ref_all_logits[batch_size:]
+                            logprob_ref_chosen = calc_masked_log_probs(
+                                ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
+                            )
+                            logprob_ref_reject = calc_masked_log_probs(
+                                ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
+                            )
+                    else:
+                        logprob_ref_chosen = None
+                        logprob_ref_reject = None
+
+                    # Merge chosen and reject
+                    inputs_ids = torch.stack([item for tup in zip(chosen_input_ids, reject_input_ids) for item in tup])
+                    attention_mask = torch.stack(
+                        [item for tup in zip(chosen_attention_mask, reject_attention_mask) for item in tup]
+                    )
+                    loss_mask = torch.stack([item for tup in zip(chosen_loss_mask, reject_loss_mask) for item in tup])
+                    logprob_ref = torch.stack(
+                        [item for tup in zip(logprob_ref_chosen, logprob_ref_reject) for item in tup]
+                    )
+
+                    data_iter = iter(
+                        [
+                            {
+                                "input_ids": inputs_ids,
+                                "attention_mask": attention_mask,
+                                "loss_mask": loss_mask,
+                                "logprob_ref": logprob_ref,
+                            }
+                        ]
+                    )
+                    rewards = []
+
+                    def _criterion(outputs, inputs):
+                        loss, chosen_rewards, rejected_rewards = self.actor_loss_fn(
+                            calc_masked_log_probs(
+                                outputs["logits"][0::2],
+                                inputs["input_ids"][0::2],
+                                inputs["loss_mask"][0::2][:, 1:],
+                                self.length_normalization,
+                            ),
+                            calc_masked_log_probs(
+                                outputs["logits"][1::2],
+                                inputs["input_ids"][1::2],
+                                inputs["loss_mask"][1::2][:, 1:],
+                                self.length_normalization,
+                            ),
+                            inputs["logprob_ref"][0::2] if inputs["logprob_ref"] is not None else None,
+                            inputs["logprob_ref"][1::2] if inputs["logprob_ref"] is not None else None,
+                            inputs["loss_mask"][0::2][:, 1:],
+                            inputs["loss_mask"][1::2][:, 1:],
+                        )
+                        rewards.append(chosen_rewards)
+                        rewards.append(rejected_rewards)
+                        return loss
+
+                    outputs = self.booster.execute_pipeline(
+                        data_iter,
+                        self.model,
+                        criterion=_criterion,
+                        optimizer=self.optimizer,
+                        return_loss=True,
+                    )
+                    loss = outputs["loss"]
+                    if self.booster.plugin.stage_manager.is_last_stage():
+                        chosen_rewards, rejected_rewards = rewards[0], rewards[1]
+                        global_loss = all_reduce_mean(loss, self.plugin)
+                        chosen_rewards_mean = all_reduce_mean(chosen_rewards, self.plugin)
+                        rejected_rewards_mean = all_reduce_mean(rejected_rewards, self.plugin)
+                        if dist.get_rank() == dist.get_world_size() - 1:
+                            step_bar.set_postfix(
+                                {
+                                    "eval/loss": global_loss.item(),
+                                    "eval/lr": self.actor_scheduler.get_last_lr()[0],
+                                    "eval/chosen_rewards": chosen_rewards.to(torch.float16).mean().item(),
+                                    "eval/rejected_rewards": rejected_rewards.to(torch.float16).mean().item(),
+                                }
+                            )
+                            self.accumulative_meter.add(
+                                "chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item()
+                            )
+                            self.accumulative_meter.add(
+                                "rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item()
+                            )
+                            self.accumulative_meter.add("loss", global_loss.to(torch.float16).item())
+                            step_bar.update()
+                if self.booster.plugin.stage_manager.is_last_stage():
+                    msg = "\nEvaluation Result:\n"
+                    for tag in ["loss", "chosen_rewards", "rejected_rewards"]:
+                        msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n"
+                    if dist.get_rank() == dist.get_world_size() - 1:
+                        print(msg)
+        else:
+            step_bar = trange(
+                len(self.eval_dataloader),
+                desc=f"Epoch {epoch + 1}/{self.max_epochs}",
+                disable=not is_rank_0(),
+            )
+            with torch.no_grad():
+                for i, batch in enumerate(self.eval_dataloader):
+                    batch = to_device(batch, self.device)
+                    (
+                        chosen_input_ids,
+                        chosen_attention_mask,
+                        chosen_loss_mask,
+                        reject_input_ids,
+                        reject_attention_mask,
+                        reject_loss_mask,
+                    ) = (
+                        batch["chosen_input_ids"],
+                        batch["chosen_attention_mask"],
+                        batch["chosen_loss_mask"],
+                        batch["reject_input_ids"],
+                        batch["reject_attention_mask"],
+                        batch["reject_loss_mask"],
+                    )
+                    if not self.apply_loss_mask:
+                        chosen_loss_mask = chosen_loss_mask.fill_(1.0)
+                        reject_loss_mask = reject_loss_mask.fill_(1.0)
+
+                    batch_size = chosen_input_ids.size()[0]
+
+                    actor_all_logits = self.model(
+                        torch.cat([chosen_input_ids, reject_input_ids]),
+                        torch.cat([chosen_attention_mask, reject_attention_mask]),
+                    )["logits"]
+                    actor_chosen_logits = actor_all_logits[:batch_size]
+                    actor_reject_logits = actor_all_logits[batch_size:]
+
+                    logprob_actor_chosen = calc_masked_log_probs(
+                        actor_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
+                    )
+
+                    logprob_actor_reject = calc_masked_log_probs(
+                        actor_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
+                    )
+                    ref_all_logits = self.ref_model(
+                        torch.cat([chosen_input_ids, reject_input_ids]),
+                        torch.cat([chosen_attention_mask, reject_attention_mask]),
+                    )["logits"]
+                    ref_chosen_logits = ref_all_logits[:batch_size]
+                    ref_reject_logits = ref_all_logits[batch_size:]
+                    logprob_ref_chosen = calc_masked_log_probs(
+                        ref_chosen_logits, chosen_input_ids, chosen_loss_mask[:, 1:], self.length_normalization
+                    )
+                    logprob_ref_reject = calc_masked_log_probs(
+                        ref_reject_logits, reject_input_ids, reject_loss_mask[:, 1:], self.length_normalization
+                    )
+
+                    losses, chosen_rewards, rejected_rewards = self.actor_loss_fn(
+                        logprob_actor_chosen,
+                        logprob_actor_reject,
+                        logprob_ref_chosen if logprob_ref_chosen is not None else None,
+                        logprob_ref_reject if logprob_ref_reject is not None else None,
+                        chosen_loss_mask[:, 1:],
+                        reject_loss_mask[:, 1:],
+                    )
+                    reward_accuracies = (chosen_rewards > rejected_rewards).float().mean()
+                    loss = losses.mean()
+                    loss_mean = all_reduce_mean(tensor=loss)
+                    chosen_rewards_mean = all_reduce_mean(tensor=chosen_rewards)
+                    rejected_rewards_mean = all_reduce_mean(tensor=rejected_rewards)
+                    reward_accuracies_mean = all_reduce_mean(tensor=reward_accuracies)
+                    self.accumulative_meter.add("chosen_rewards", chosen_rewards_mean.to(torch.float16).mean().item())
+                    self.accumulative_meter.add(
+                        "rejected_rewards", rejected_rewards_mean.to(torch.float16).mean().item()
+                    )
+                    self.accumulative_meter.add("loss", loss_mean.to(torch.float16).item())
+                    self.accumulative_meter.add("accuracy", reward_accuracies_mean.to(torch.float16).item())
+                    self.accumulative_meter.add(
+                        "margin", (chosen_rewards_mean - rejected_rewards_mean).to(torch.float16).mean().item()
+                    )
+                    step_bar.set_postfix(
+                        {
+                            "eval/loss": self.accumulative_meter.get("loss"),
+                            "eval/chosen_rewards": self.accumulative_meter.get("chosen_rewards"),
+                            "eval/rejected_rewards": self.accumulative_meter.get("rejected_rewards"),
+                            "eval/accuracy": self.accumulative_meter.get("accuracy"),
+                        }
+                    )
+                    step_bar.update()
+
+            msg = "\nEvaluation Result:\n"
+            for tag in ["loss", "chosen_rewards", "rejected_rewards", "accuracy", "margin"]:
+                msg = msg + f"{tag}: {self.accumulative_meter.get(tag)}\n"
+            self.coordinator.print_on_master(msg)
+        if self.save_dir is not None:
+            os.makedirs(self.save_dir, exist_ok=True)
+            with open(os.path.join(self.save_dir, f"eval_result_epoch{epoch}.txt"), "w") as f:
+                f.write(msg)
         step_bar.close()
diff --git a/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py b/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
index a35f2bf52dfd..b551497b9cce 100644
--- a/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
+++ b/applications/ColossalChat/examples/data_preparation_scripts/prepare_dataset.py
@@ -73,8 +73,7 @@ def main():
         "--conversation_template_config",
         type=str,
         default="conversation_template_config",
-        help="Path \
-        to save conversation template config files.",
+        help="Path to save conversation template config files.",
     )
     parser.add_argument("--data_cache_dir", type=str, default="cache", help="Data cache directory")
     parser.add_argument(
diff --git a/applications/ColossalChat/examples/training_scripts/train_dpo.py b/applications/ColossalChat/examples/training_scripts/train_dpo.py
index 3b324ee784e0..ad81db73aa53 100755
--- a/applications/ColossalChat/examples/training_scripts/train_dpo.py
+++ b/applications/ColossalChat/examples/training_scripts/train_dpo.py
@@ -13,7 +13,7 @@
 
 import colossalai
 from colossalai.booster import Booster
-from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin
+from colossalai.booster.plugin import GeminiPlugin, HybridParallelPlugin, LowLevelZeroPlugin, TorchDDPPlugin
 from colossalai.cluster import DistCoordinator
 from colossalai.logging import get_dist_logger
 from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
@@ -29,8 +29,6 @@ def train(args):
     # check lora compatibility
     if "gemini" in args.plugin and lora_config is not None and lora_config.r > 0:
         raise ValueError("LoRA is not supported in GeminiPlugin. Please use other plugin")
-    if args.plugin == "gemini_auto" and args.accumulation_steps > 1:
-        raise ValueError("Gradient accumulation is not supported in GeminiPlugin. Please use other plugin")
 
     # ==============================
     # Initialize Distributed Training
@@ -46,7 +44,7 @@ def train(args):
         Default torch ddp plugin without any acceleration, for
         debugging purpose acceleration, for debugging purpose
         """
-        plugin = TorchDDPPlugin(find_unused_parameters=True)
+        plugin = TorchDDPPlugin(find_unused_parameters=not args.grad_checkpoint)
     elif args.plugin == "gemini":
         plugin = GeminiPlugin(
             precision=args.mixed_precision,
@@ -56,14 +54,6 @@ def train(args):
             enable_gradient_accumulation=True,
             enable_flash_attention=args.use_flash_attn,
         )
-    elif args.plugin == "gemini_auto":
-        plugin = GeminiPlugin(
-            precision=args.mixed_precision,
-            placement_policy="auto",
-            initial_scale=2**16,
-            max_norm=args.grad_clip,
-            enable_flash_attention=args.use_flash_attn,
-        )
     elif args.plugin == "zero2":
         plugin = LowLevelZeroPlugin(
             stage=2,
@@ -92,20 +82,24 @@ def train(args):
             parallel_output=False,
             max_norm=args.grad_clip,
             precision=args.mixed_precision,
+            microbatch_size=args.microbatch_size,
         )
     else:
         raise ValueError(f"Unknown plugin {args.plugin}")
 
     booster = Booster(plugin=plugin)
-    ref_booster = Booster(plugin=plugin)
 
-    # ======================================================
-    # Initialize Model, Objective, Optimizer and LR Scheduler
-    # ======================================================
-    # Temp Fix: Disable lazy init due to version conflict
-    # init_ctx = (
-    #     LazyInitContext(default_device=get_current_device()) if isinstance(plugin, (GeminiPlugin,)) else nullcontext()
-    # )
+    ref_plugin = HybridParallelPlugin(
+        tp_size=args.ref_tp,
+        pp_size=1,
+        zero_stage=args.zero_stage,
+        enable_flash_attention=args.use_flash_attn,
+        cpu_offload=True if args.zero_stage >= 1 and args.zero_cpu_offload else False,
+        parallel_output=False,
+        max_norm=args.grad_clip,
+        precision=args.mixed_precision,
+    )
+    ref_booster = Booster(plugin=ref_plugin)
 
     init_ctx = nullcontext()
     with init_ctx:
@@ -130,6 +124,7 @@ def train(args):
                 ref_model = AutoModelForCausalLM.from_pretrained(args.pretrain)
         else:
             ref_model = None
+
         if args.lora_config is not None:
             model = convert_to_lora_module(model, lora_config=lora_config)
             for name, module in model.named_modules():
@@ -139,7 +134,9 @@ def train(args):
         disable_dropout(ref_model)
 
     if args.grad_checkpoint:
-        # Note, for some models, lora may not be compatible with gradient checkpointing
+        # Make sure gradient checkpointing can be activated.
+        model.train()
+        # Note, for some models, lora may not be compatible with gradient checkpointing.
         model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
         coordinator.print_on_master(msg="Gradient checkpointing enabled successfully")
 
@@ -169,7 +166,7 @@ def train(args):
         adamw_mode=True,
     )
 
-    # configure dataset
+    # Configure dataset
     coordinator.print_on_master(f"Load dataset: {args.dataset}")
     mode_map = {"train": "train", "valid": "validation", "test": "test"}
     train_dataset = load_tokenized_dataset(dataset_paths=args.dataset, mode="train", mode_map=mode_map)
@@ -213,14 +210,15 @@ def train(args):
 
     default_dtype = torch.float16 if args.mixed_precision == "fp16" else torch.bfloat16
     torch.set_default_dtype(default_dtype)
+
     model, optim, _, train_dataloader, lr_scheduler = booster.boost(
         model=model,
         optimizer=optim,
         lr_scheduler=lr_scheduler,
         dataloader=train_dataloader,
     )
-    if ref_model is not None:
-        ref_model, _, _, _, _ = ref_booster.boost(model=ref_model, dataloader=train_dataloader)
+    ref_model, _, _, _, _ = ref_booster.boost(model=ref_model)
+
     torch.set_default_dtype(torch.float)
 
     coordinator.print_on_master(f"Booster init max CUDA memory: {torch.cuda.max_memory_allocated() / 1024 ** 2:.2f} MB")
@@ -312,7 +310,7 @@ def train(args):
         "--plugin",
         type=str,
         default="gemini",
-        choices=["gemini", "gemini_auto", "zero2", "zero2_cpu", "3d"],
+        choices=["gemini", "zero2", "zero2_cpu", "3d", "ddp"],
         help="Choose which plugin to use",
     )
     parser.add_argument("--grad_clip", type=float, default=1.0, help="Gradient clipping value")
@@ -342,22 +340,35 @@ def train(args):
     parser.add_argument("--max_length", type=int, default=2048, help="Model max length")
     parser.add_argument("--max_epochs", type=int, default=3)
     parser.add_argument("--batch_size", type=int, default=4)
-    parser.add_argument(
-        "--disable_reference_model",
-        action="store_true",
-        default=False,
-        help="Disable the reference model (enabled by default)",
-    )
     parser.add_argument("--disable_loss_mask", default=False, action="store_true")
     parser.add_argument("--mixed_precision", type=str, default="fp16", choices=["fp16", "bf16"], help="Mixed precision")
     parser.add_argument("--lora_config", type=str, default=None, help="low-rank adaptation config file path")
     parser.add_argument("--save_interval", type=int, default=1000, help="number of step between two checkpoints")
     parser.add_argument("--lr", type=float, default=5e-6)
-    parser.add_argument("--accumulation_steps", type=int, default=8)
+    parser.add_argument("--accumulation_steps", type=int, default=1)
     parser.add_argument("--log_dir", default=None, type=str)
     parser.add_argument("--use_wandb", default=False, action="store_true")
     parser.add_argument("--grad_checkpoint", default=False, action="store_true")
     parser.add_argument("--use_flash_attn", default=False, action="store_true")
+    parser.add_argument(
+        "--microbatch_size",
+        type=int,
+        default=2,
+        help="Micro batch size for PP training. To activate PP training for DPO-like algorithm, you must keep size even and the size should be equal or greater than 2.",
+    )
+    # Parameter for reference model
+    parser.add_argument(
+        "--disable_reference_model",
+        action="store_true",
+        default=False,
+        help="Disable the reference model (enabled by default)",
+    )
+    parser.add_argument(
+        "--ref_tp",
+        type=int,
+        default=1,
+        help="TP size for reference model; used only when reference model is too large.",
+    )
     args = parser.parse_args()
 
     # fool proof hyperparameter setup
diff --git a/applications/ColossalChat/examples/training_scripts/train_sft.py b/applications/ColossalChat/examples/training_scripts/train_sft.py
index 62acad32f66a..e319340c3f60 100755
--- a/applications/ColossalChat/examples/training_scripts/train_sft.py
+++ b/applications/ColossalChat/examples/training_scripts/train_sft.py
@@ -68,7 +68,7 @@ def train(args):
         Default torch ddp plugin without any acceleration, for
         debugging purpose acceleration, for debugging purpose
         """
-        plugin = TorchDDPPlugin(find_unused_parameters=True if args.grad_checkpoint is False else False)
+        plugin = TorchDDPPlugin(find_unused_parameters=not args.grad_checkpoint)
     elif args.plugin == "gemini":
         plugin = GeminiPlugin(
             precision=args.mixed_precision,
diff --git a/applications/ColossalChat/tests/test_templating.sh b/applications/ColossalChat/tests/test_templating.sh
index 6ee10e8bed87..defe6f71bb4c 100755
--- a/applications/ColossalChat/tests/test_templating.sh
+++ b/applications/ColossalChat/tests/test_templating.sh
@@ -4,7 +4,7 @@ BASE_TEMP_DIR=$BASE_DIR/temp
 EXAMPLES_DIR=$BASE_DIR/examples
 TEST_DATA_DIR=$BASE_DIR/tests/test_data
 DATA_SAVE_PATH=$BASE_TEMP_DIR/tests
-CONFIG_DIR=$BASE_DIR/config
+CONFIG_DIR=$BASE_DIR/conversation_template
 
 # MODELS=("colossal-llama2" "llama2" "mistral" "chatGLM2" "chatGLM3" "deepseek" "Yi" "baichuan")  # for local test
 MODELS=("colossal-llama2" "llama2" "chatGLM2" "chatGLM3" "deepseek" "Yi")
@@ -39,23 +39,23 @@ get_pretrain() {
 get_conversation_template_config() {
     local model=$1
     if [[ $model == "colossal-llama2" ]]; then
-        echo "$CONFIG_DIR/conversation_template/colossal-llama2.json"
+        echo "$CONFIG_DIR/colossal-llama2.json"
     elif [[ $model == "llama2" ]]; then
-        echo "$CONFIG_DIR/conversation_template/llama2.json"
+        echo "$CONFIG_DIR/llama2.json"
     elif [[ $model == "deepseek" ]]; then
-        echo "$CONFIG_DIR/conversation_template/deepseek-ai_DeepSeek-V2-Lite.json"
+        echo "$CONFIG_DIR/deepseek-ai_DeepSeek-V2-Lite.json"
     elif [[ $model == "mistral" ]]; then
-        echo "$CONFIG_DIR/conversation_template/mistralai_Mixtral-8x7B-Instruct-v0.1.json"
+        echo "$CONFIG_DIR/mistralai_Mixtral-8x7B-Instruct-v0.1.json"
     elif [[ $model == "chatGLM2" ]]; then
-        echo "$CONFIG_DIR/conversation_template/THUDM_chatglm2-6b.json"
+        echo "$CONFIG_DIR/THUDM_chatglm2-6b.json"
     elif [[ $model == "chatGLM3" ]]; then
-        echo "$CONFIG_DIR/conversation_template/THUDM_chatglm3-6b.json"
+        echo "$CONFIG_DIR/THUDM_chatglm3-6b.json"
     elif [[ $model == "phi" ]]; then
-        echo "$CONFIG_DIR/conversation_template/microsoft_phi-2.json"
+        echo "$CONFIG_DIR/microsoft_phi-2.json"
     elif [[ $model == "Yi" ]]; then
-        echo "$CONFIG_DIR/conversation_template/01-ai_Yi-1.5-9B-Chat.json"
+        echo "$CONFIG_DIR/01-ai_Yi-1.5-9B-Chat.json"
     elif [[ $model == "baichuan" ]]; then
-        echo "$CONFIG_DIR/conversation_template/baichuan-inc_Baichuan2-13B-Chat.json"
+        echo "$CONFIG_DIR/baichuan-inc_Baichuan2-13B-Chat.json"
     else
         echo "Unknown model $model"
         exit 1
@@ -71,6 +71,7 @@ for model in ${MODELS[@]}; do
     rm -rf $SAVE_DIR/arrow
     pretrain=$(get_pretrain $model)
     conversation_template_config=$(get_conversation_template_config $model)
+    echo $conversation_template_config
     python $EXAMPLES_DIR/data_preparation_scripts/prepare_dataset.py --type sft --data_input_dirs $TEST_DATA_DIR/sft \
         --tokenizer_dir $pretrain \
         --conversation_template_config $conversation_template_config \
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 47c17e7494f2..2a5b60287e32 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -271,6 +271,7 @@ def llama_for_causal_lm_forward(
         hidden_states: Optional[torch.FloatTensor] = None,
         stage_index: Optional[List[int]] = None,
         shard_config: ShardConfig = None,
+        **kwargs,
     ):
         r"""
         Args:

From e1e86f9f1fdc7a885d9183b19567e440e3e8b34f Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 14 Oct 2024 11:45:35 +0800
Subject: [PATCH 120/167] fix

---
 colossalai/shardformer/layer/attn.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 411b4dbcc83a..aa013e5263a0 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -484,12 +484,7 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
                 start = i * num_ring_size
                 end = (i + 1) * num_ring_size
                 for idx in range(start, end):
-                    inner_rank = []
-                    for k in range(inner_ring_size):
-                        current_num = idx + k * tp_size
-                        if current_num >= end:
-                            break
-                        inner_rank.append(current_num)
+                    inner_rank = [idx + k * tp_size for k in range(inner_ring_size) if idx + k * tp_size < end]
                     if len(inner_rank) == inner_ring_size and inner_rank not in ranks:
                         ranks.append(inner_rank)
                         group = dist.new_group(inner_rank)

From d891e50617b07bc09f3213069b857ba8c0325d2c Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 14 Oct 2024 14:56:05 +0800
Subject: [PATCH 121/167] fix

---
 colossalai/shardformer/layer/attn.py | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index aa013e5263a0..36a239142d62 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -445,18 +445,11 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
         tp_size = dist.get_world_size(tp_group) if tp_group is not None else 1
         sp_rank = dist.get_rank(sp_group)
 
-        if inner_ring_size is None:
-            if torch.cuda.device_count() >= dist.get_world_size():
-                # single node, no need to consider NICs
-                return sp_group, sp_group
-            if sp_size <= 4:
-                inner_ring_size = min(2, sp_size)
-            else:
-                inner_ring_size = min(4, sp_size)
-        else:
-            assert (
-                inner_ring_size <= sp_size and sp_size % inner_ring_size == 0
-            ), f"Error: sp_size {sp_size} should be divisible by inner_ring_size {inner_ring_size}"
+        assert inner_ring_size is not None
+
+        assert (
+            inner_ring_size <= sp_size and sp_size % inner_ring_size == 0
+        ), f"Error: sp_size {sp_size} should be divisible by inner_ring_size {inner_ring_size}"
 
         if inner_ring_size == sp_size:
             return sp_group, sp_group
@@ -575,7 +568,7 @@ def attention(
 
         clone_pg = lambda pg: dist.new_group(dist.get_process_group_ranks(pg))
 
-        if RingAttention.SP_GROUP is not sp_group:
+        if inner_ring_size != None:
             RingAttention.SP_GROUP = sp_group
             inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(
                 sp_group, tp_group, inner_ring_size

From cfade4c36d1d0eda9793faf15ca49a214ddb51c0 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Mon, 14 Oct 2024 07:02:43 +0000
Subject: [PATCH 122/167] [feat] Linear1D_COL/ROW support zbv WeightGradStore;

---
 .../pipeline/schedule/zero_bubble_pp.py       |  42 +-
 colossalai/pipeline/weight_grad_store.py      | 106 +++
 colossalai/shardformer/layer/_operation.py    |  69 +-
 colossalai/shardformer/layer/linear.py        |   1 -
 examples/language/llama/benchmark.py          |   3 +
 .../test_schedule/test_zerobubble_pp.py       |   1 +
 tests/test_pipeline/test_schedule/zbv_poc.py  | 628 ++++++++++++++++++
 7 files changed, 821 insertions(+), 29 deletions(-)
 create mode 100644 colossalai/pipeline/weight_grad_store.py
 create mode 100644 tests/test_pipeline/test_schedule/zbv_poc.py

diff --git a/colossalai/pipeline/schedule/zero_bubble_pp.py b/colossalai/pipeline/schedule/zero_bubble_pp.py
index c928a207c405..089ca48eeb60 100644
--- a/colossalai/pipeline/schedule/zero_bubble_pp.py
+++ b/colossalai/pipeline/schedule/zero_bubble_pp.py
@@ -11,6 +11,7 @@
 from colossalai.pipeline.p2p import PipelineP2PCommunication
 from colossalai.pipeline.schedule.v_schedule import ScheduledNode
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.pipeline.weight_grad_store import WeightGradStore
 
 from ._utils import (
     clone,
@@ -650,10 +651,10 @@ def schedule_f(
         # Do not release_tensor_data loss, release_tensor_data other output_obj;
         if model_chunk_id == 1 and self.stage_manager.is_first_stage(ignore_chunk=True):
             self.output_tensors[model_chunk_id].append(output_obj)
-            self.output_tensors_dw[model_chunk_id].append(output_obj)
+            # self.output_tensors_dw[model_chunk_id].append(output_obj)
         else:
             self.output_tensors[model_chunk_id].append(output_obj)
-            self.output_tensors_dw[model_chunk_id].append(output_obj)
+            # self.output_tensors_dw[model_chunk_id].append(output_obj)
 
         # add output to send_fwd_buffer
         if model_chunk_id == 0:  # chunk 0
@@ -705,13 +706,13 @@ def schedule_b(
         input_obj = self.input_tensors[model_chunk_id].pop(0)
         output_obj = self.output_tensors[model_chunk_id].pop(0)
 
-        # save output_tensor_grad for dw
-        if model_chunk_id == 1 and self.stage_manager.is_first_stage(ignore_chunk=True):
-            # we save loss here
-            self.output_tensors_grad_dw[model_chunk_id].append(output_obj)
-        else:
-            # we save output_tensor_grad here
-            self.output_tensors_grad_dw[model_chunk_id].append(output_tensor_grad)
+        # # save output_tensor_grad for dw
+        # if model_chunk_id == 1 and self.stage_manager.is_first_stage(ignore_chunk=True):
+        #     # we save loss here
+        #     self.output_tensors_grad_dw[model_chunk_id].append(output_obj)
+        # else:
+        #     # we save output_tensor_grad here
+        #     self.output_tensors_grad_dw[model_chunk_id].append(output_tensor_grad)
 
         # Step2: bwd step
         input_object_grad = self.backward_b_step(
@@ -738,6 +739,7 @@ def schedule_b(
             # send to next
             else:
                 self.send_backward_buffer[model_chunk_id].append(input_object_grad)
+        WeightGradStore.flush(chunk=model_chunk_id)
 
     def schedule_w(
         self,
@@ -757,16 +759,18 @@ def schedule_w(
         """
 
         # get y & dy from buffer
-        output_obj = self.output_tensors_dw[model_chunk_id].pop(0)
-        output_obj_grad = self.output_tensors_grad_dw[model_chunk_id].pop(0)
-
-        self.backward_w_step(
-            model_chunk=model_chunk,
-            model_chunk_id=model_chunk_id,
-            optimizer=optimizer,
-            output_obj=output_obj,
-            output_obj_grad=output_obj_grad,
-        )
+        # output_obj = self.output_tensors_dw[model_chunk_id].pop(0)
+        # output_obj_grad = self.output_tensors_grad_dw[model_chunk_id].pop(0)
+
+        WeightGradStore.pop(chunk=model_chunk_id)
+
+        # self.backward_w_step(
+        #     model_chunk=model_chunk,
+        #     model_chunk_id=model_chunk_id,
+        #     optimizer=optimizer,
+        #     output_obj=output_obj,
+        #     output_obj_grad=output_obj_grad,
+        # )
 
     def run_forward_only(
         self,
diff --git a/colossalai/pipeline/weight_grad_store.py b/colossalai/pipeline/weight_grad_store.py
new file mode 100644
index 000000000000..5d7f76649483
--- /dev/null
+++ b/colossalai/pipeline/weight_grad_store.py
@@ -0,0 +1,106 @@
+import queue
+
+# from megatron import get_args
+# from megatron.core import parallel_state
+# from megatron.core.distributed.finalize_model_grads import _allreduce_embedding_grads
+# from megatron.core.utils import get_model_config, get_attr_wrapped_model
+
+
+class WeightGradStore:
+
+    cache = []
+    weight_grad_queue = [queue.Queue(), queue.Queue()]
+
+    @classmethod
+    def put(cls, total_input, grad_output, weight, func):
+        # func(total_input, grad_output, weight.main_grad)
+        cls.cache.append((total_input, grad_output, weight, func))
+
+    @classmethod
+    def flush(cls, chunk=0):
+        cls.weight_grad_queue[chunk].put(cls.cache)
+        cls.cache = []
+
+    @classmethod
+    def pop(cls, chunk=0):
+        if cls.weight_grad_queue[chunk].qsize() > 0:
+            stored_grads = cls.weight_grad_queue[chunk].get()
+            for total_input, grad_output, weight, func in stored_grads:
+                if weight.grad is not None:
+                    func(total_input, grad_output, weight.grad)
+                # for first bwd; weight.grad is None, assign grad_weight to weight.grad
+                else:
+                    grad_weight = func(total_input, grad_output)
+                    weight.grad = grad_weight
+        else:
+            raise Exception("Pop empty queue.")
+
+    # @classmethod
+    # def clear(cls, model, chunk=0):
+    #     weight_grad_tasks = []
+    #     while cls.weight_grad_queue[chunk].qsize() > 0:
+    #         stored_grads = cls.weight_grad_queue[chunk].get()
+    #         if len(weight_grad_tasks) == 0:
+    #             for _ in stored_grads:
+    #                 weight_grad_tasks.append([])
+    #         else:
+    #             assert len(weight_grad_tasks) == len(stored_grads)
+    #         for i, task in enumerate(stored_grads):
+    #             weight_grad_tasks[i].append(task)
+    #     weight_params = []
+    #     handles = []
+    #     if get_args().overlap_grad_reduce:
+    #         handles += model.async_reduce_grad()
+
+    #     output_layer_weight = None
+    #     if parallel_state.is_pipeline_last_stage():
+    #         assert len(weight_grad_tasks) > 0
+    #         output_layer_grads = weight_grad_tasks[0]
+    #         for j in range(len(output_layer_grads)):
+    #             total_input, grad_output, weight, func = output_layer_grads[j]
+    #             if output_layer_weight is None:
+    #                 output_layer_weight = weight
+    #             assert output_layer_weight is weight
+    #             func(total_input, grad_output, weight.main_grad)
+    #             output_layer_grads[j] = None  # release memory
+    #         weight_grad_tasks = weight_grad_tasks[1:]
+    #         if get_args().overlap_grad_reduce:
+    #             handles += model.async_reduce_grad(output_layer_weight)
+
+    #     if parallel_state.is_pipeline_first_stage() or parallel_state.is_pipeline_last_stage():
+    #         model_module = get_attr_wrapped_model(model, 'pre_process', return_model_obj=True)
+    #         if model_module.share_embeddings_and_output_weights:
+    #             # if share_embeddings_and_output_weights, wait all-reduce for embeddings
+    #             for handle in handles:
+    #                 if handle is not None:
+    #                     handle.wait()
+    #             handles = []
+
+    #     config = get_model_config(model)
+    #     # Do async all-reduce for embedding grads firstly, so that the rank 0 won't
+    #     # be blocked
+    #     embedding_handles = _allreduce_embedding_grads([model], config, async_op=True)
+    #     handles += embedding_handles
+
+    #     for i in range(len(weight_grad_tasks)):
+    #         tasks = weight_grad_tasks[i]
+    #         param = None
+    #         for j in range(len(tasks)):
+    #             total_input, grad_output, weight, func = tasks[j]
+    #             if param is None:
+    #                 param = weight
+    #             assert param is weight
+    #             assert not (weight is output_layer_weight)
+    #             func(total_input, grad_output, weight.main_grad)
+    #             tasks[j] = None  # release memory
+    #         weight_params.append(param)
+    #         if get_args().overlap_grad_reduce:
+    #             # All-reduce param grad here
+    #             handles += model.async_reduce_grad(param)
+    #         weight_grad_tasks[i] = None  # release memory
+
+    #     # timers('wait_all_reduce', log_level=1).start(barrier=False)
+    #     for handle in embedding_handles:
+    #         if handle is not None:
+    #             handle.wait()
+    #     # timers('wait_all_reduce').stop()
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index aec82356747a..626a009ec430 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -1,7 +1,11 @@
+import functools
+
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
 
+from colossalai.pipeline.weight_grad_store import WeightGradStore
+
 from .utils import is_share_sp_tp
 
 try:
@@ -125,12 +129,13 @@ class LinearWithAsyncCommunication(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False):
+    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False, use_zbv=True):
         ctx.save_for_backward(input_, weight, bias)
         ctx.use_bias = bias is not None
         ctx.process_group = process_group
         ctx.async_grad_allreduce = async_grad_allreduce
         ctx.fp8_communication = fp8_communication
+        ctx.use_zbv = use_zbv
         if bias is not None:
             output = F.linear(input_, weight, bias)
         else:
@@ -143,6 +148,14 @@ def backward(ctx, grad_output):
         input, weight, bias = ctx.saved_tensors
         use_bias = ctx.use_bias
         fp8_communication = ctx.fp8_communication
+        use_zbv = ctx.use_zbv
+
+        def execute_w_pass_grad_accum(_input_, _grad_output_, _weight_main_grad_, wgrad_gemm_accum_func=None):
+            wgrad_gemm_accum_func(_input_, _grad_output_, _weight_main_grad_)
+
+        def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_func=None):
+            # _grad_output_.t().matmul(_input_)
+            return wgrad_gemm_func(_grad_output_.t(), _input_)
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to bias.
         if use_bias:
@@ -167,22 +180,60 @@ def backward(ctx, grad_output):
 
         if _grad_accum_fusion_available and weight.grad is not None:
             grad = weight.grad
-            if grad.dtype == torch.float32:
-                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
-                grad_weight = None
-            elif grad.dtype == torch.float16:
-                fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad)
+            if use_zbv:
+                # TODO: append input, grad_output_, weight, grad func to WeightGradStore
+                if grad.dtype == torch.float32:
+                    WeightGradStore.put(
+                        total_input,
+                        grad_output,
+                        weight,
+                        functools.partial(
+                            execute_w_pass_grad_accum,
+                            wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
+                        ),
+                    )
+                    grad_weight = None
+                elif grad.dtype in (torch.float16, torch.bfloat16):
+                    WeightGradStore.put(
+                        total_input,
+                        grad_output,
+                        weight,
+                        functools.partial(
+                            execute_w_pass_grad_accum,
+                            wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
+                        ),
+                    )
+                    grad_weight = None
+                else:
+                    raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
+            else:
+                if grad.dtype == torch.float32:
+                    fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
+                    grad_weight = None
+                elif grad.dtype == torch.float16:
+                    fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad)
+                    grad_weight = None
+                else:
+                    grad_weight = grad_output.t().matmul(total_input)
+        else:
+            if use_zbv:
+                WeightGradStore.put(
+                    total_input,
+                    grad_output,
+                    weight,
+                    functools.partial(
+                        execute_w_pass,
+                        wgrad_gemm_func=torch.matmul,
+                    ),
+                )
                 grad_weight = None
             else:
                 grad_weight = grad_output.t().matmul(total_input)
-        else:
-            grad_weight = grad_output.t().matmul(total_input)
 
         grad_bias = grad_output.sum(dim=0) if use_bias else None
 
         if ctx.async_grad_allreduce and not fp8_communication:
             handle.wait()
-
         return grad_input, grad_weight, grad_bias, None, None, None, None
 
 
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index d77dd496592f..25f4228a4e62 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -201,7 +201,6 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
 
         # Matrix multiply.
         bias = self.bias if not self.skip_bias_add else None
-
         if self.seq_parallel_mode == "split_gather":
             input_parallel = gather_forward_reducescatter_backward(
                 input_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 0f418edb628b..4f2c45d75ba8 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -5,6 +5,8 @@
 from contextlib import nullcontext
 
 import torch
+
+torch.autograd.set_detect_anomaly(True)
 import torch.distributed as dist
 from data_utils import RandomDataset
 from model_utils import format_numel_str, get_model_numel
@@ -251,6 +253,7 @@ def empty_init():
             use_fp8=args.use_fp8,
             fp8_communication=args.use_fp8_comm,
             scheduler_nodes=scheduler_nodes,
+            make_vocab_size_divisible_by=1,
             **hybrid_kwargs,
         )
     elif args.plugin == "3d_cpu":
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index 1e8f1392e470..4225da802d78 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -926,5 +926,6 @@ def test_pp():
     )
 
 
+# python -m pytest -s tests/test_pipeline/test_schedule/test_zerobubble_pp.py
 if __name__ == "__main__":
     test_pp()
diff --git a/tests/test_pipeline/test_schedule/zbv_poc.py b/tests/test_pipeline/test_schedule/zbv_poc.py
new file mode 100644
index 000000000000..6280990a9edd
--- /dev/null
+++ b/tests/test_pipeline/test_schedule/zbv_poc.py
@@ -0,0 +1,628 @@
+import gc
+import time
+from copy import deepcopy
+
+import torch
+import torch.nn as nn
+from torch.testing import assert_close
+
+
+def get_model_numel(model):
+    return sum(p.numel() for p in model.parameters()) / 1024**2
+
+
+# Step1: dx = w*dy
+def backward_b(loss, x, model):
+    torch.autograd.backward(loss, inputs=x, retain_graph=True)
+
+
+# Step2: dummy dw = x*dy
+def backward_w(loss, model):
+    torch.autograd.backward(loss, inputs=list(model.parameters()))
+
+
+def test_double_dx_dw_split_nsync():
+    device = "cuda:0"
+    model = nn.Linear(4096, 4096, bias=None).to(device=device)
+    # print(f"model numel {get_model_numel(model)}") # 4GB
+    x1 = torch.rand(4096, 4096).to(device=device)
+    x2 = torch.rand(4096, 4096).to(device=device)
+    ref_model = deepcopy(model)
+    ref_x1 = x1.clone()
+    ref_x2 = x1.clone()
+
+    # first step
+    x1.requires_grad_()
+    x2.requires_grad_()
+    ref_x1.requires_grad_()
+    ref_x2.requires_grad_()
+
+    # loss for dx_dw bwd
+    loss1 = model(x1).sum()
+    loss2 = model(x2).sum()
+
+    # loss for common bwd
+    ref_loss1 = ref_model(ref_x1).sum()
+    ref_loss2 = ref_model(ref_x2).sum()
+
+    # dx1
+    torch.cuda.synchronize()
+    bwd_b_start_time = time.time()
+    backward_b(loss1, x1, model)
+    bwd_b_end_time = time.time()
+    print(f"loss_1 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
+
+    for p in model.parameters():
+        assert p.grad is None
+    assert x1.grad is not None
+
+    # dx2
+    torch.cuda.synchronize()
+    bwd_b_start_time = time.time()
+    backward_b(loss2, x2, model)
+    bwd_b_end_time = time.time()
+    print(f"loss_2 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
+
+    # dw1
+    torch.cuda.synchronize()
+    bwd_w_start_time = time.time()
+    backward_w(loss1, model)
+    bwd_w_end_time = time.time()
+    print(f"loss_1 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
+    for p in model.parameters():
+        assert p.grad is not None
+
+    # common bwd 1
+    torch.cuda.synchronize()
+    comm_bwd_start_time = time.time()
+    ref_loss1.backward()
+    comm_bwd_end_time = time.time()
+    print(f"loss_1 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
+
+    # # assert dx1 & dw1 == bwd 1
+    # assert_close(x1.grad, ref_x1.grad)
+    # for p1, p2 in zip(model.parameters(), ref_model.parameters()):
+    #     assert_close(p1, p2)
+    #     assert_close(p1.grad, p2.grad)
+
+    # dw2
+    torch.cuda.synchronize()
+    bwd_w_start_time = time.time()
+    backward_w(loss2, model)
+    bwd_w_end_time = time.time()
+    print(f"loss_2 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
+
+    # common bwd 2
+    torch.cuda.synchronize()
+    comm_bwd_start_time = time.time()
+    ref_loss2.backward()
+    comm_bwd_end_time = time.time()
+    print(f"loss_2 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
+
+    # # assert dx2 & dw2 == bwd 2
+    # assert_close(x2.grad, ref_x2.grad)
+    # for p1, p2 in zip(model.parameters(), ref_model.parameters()):
+    #     print(f"bwd2:\n p1 {p1.grad},\n p2 {p2.grad}\n")
+    #     assert_close(p1, p2)
+    #     assert_close(p1.grad, p2.grad)
+
+
+def test_double_dx_dw_split_sync():
+    device = "cuda:0"
+    model = nn.Linear(8, 8, bias=None).to(device=device)
+    print(f"model size {get_model_numel(model)} ")  # 4GB
+    x1 = torch.rand(8, 8).to(device=device)
+    x2 = torch.rand(8, 8).to(device=device)
+
+    # x1 = torch.ones(8, 8).to(device=device)
+    # x2 = torch.ones(8, 8).to(device=device)
+
+    ref_model = deepcopy(model)
+    ref_x1 = x1.clone()
+    ref_x2 = x2.clone()
+
+    x1.requires_grad_()
+    x2.requires_grad_()
+    ref_x1.requires_grad_()
+    ref_x2.requires_grad_()
+
+    ############
+    # step1:
+    ############
+
+    # loss1
+    loss1 = model(x1).sum()
+
+    # ref_loss1
+    ref_model(ref_x1).sum()
+
+    # dx1
+    backward_b(loss1, x1, model)
+    for p in model.parameters():
+        assert p.grad is None
+    assert x1.grad is not None
+
+    # dw1
+    backward_w(loss1, model)
+    for p in model.parameters():
+        assert p.grad is not None
+
+    # common bwd 1
+    # ref_loss1.backward()
+
+    # assert dx1 & dw1 == bwd 1
+    assert_close(x1.grad, ref_x1.grad)
+    for p1, p2 in zip(model.parameters(), ref_model.parameters()):
+        assert_close(p1, p2)
+        assert_close(p1.grad, p2.grad)
+
+    ############
+    # step2:
+    ############
+
+    # loss2
+    loss2 = model(x2).sum()
+
+    # ref_loss2
+    ref_loss2 = ref_model(ref_x2).sum()
+
+    for p1, p2 in zip(model.parameters(), ref_model.parameters()):
+        print(f"bwd2:\n p1 {p1.grad},\n p2 {p2.grad}\n")
+        assert_close(p1, p2)
+        assert_close(p1.grad, p2.grad)
+
+    # dx2
+    backward_b(loss2, x2, model)
+
+    # dw2
+    backward_w(loss2, model)
+
+    # common bwd 2
+    ref_loss2.backward()
+
+    # assert dx2 & dw2 == bwd 2
+    assert_close(x2.grad, ref_x2.grad)
+    for p1, p2 in zip(model.parameters(), ref_model.parameters()):
+        print(f"bwd2:\n p1 {p1.grad},\n p2 {p2.grad}\n")
+        assert_close(p1, p2)
+        assert_close(p1.grad, p2.grad)
+
+
+def deallocate_output_tensor(out):
+    """Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field.
+
+    This method should be called right after the output tensor has been
+    sent to the next pipeline stage. At this point, the output tensor is
+    only useful for its '.grad_fn' field, and not its '.data'.
+    """
+    assert isinstance(out, torch.Tensor), "expected Tensor, found %s." % type(out).__name__
+    assert out._base is None, "counter-productive to free a view of another tensor."
+    out.data = torch.empty(
+        (1,),
+        device=out.device,
+        dtype=out.dtype,
+    )
+
+
+IN_DIM = 8192
+OUT_DIM = 8192
+NUM_LAYER = 3
+
+
+class MlpModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.layers = nn.ModuleList([nn.Linear(IN_DIM, OUT_DIM, bias=None) for _ in range(NUM_LAYER)])
+
+    def forward(self, x):
+        for layer in self.layers:
+            x = layer(x)
+        return x
+
+
+class Attention(nn.Module):
+    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.0, proj_drop=0.0, with_qkv=True):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        self.with_qkv = with_qkv
+        if self.with_qkv:
+            self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+            self.proj = nn.Linear(dim, dim)
+            self.proj_drop = nn.Dropout(proj_drop)
+        self.attn_drop = nn.Dropout(attn_drop)
+
+    def forward(self, x):
+        B, N, C = x.shape
+        if self.with_qkv:
+            qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+            q, k, v = qkv[0], qkv[1], qkv[2]
+        else:
+            qkv = x.reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
+            q, k, v = qkv, qkv, qkv
+
+        attn = (q @ k.transpose(-2, -1)) * self.scale
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        if self.with_qkv:
+            x = self.proj(x)
+            x = self.proj_drop(x)
+        return x
+
+
+def mem_dx_dw():
+    device = "cuda:0"
+    # model = nn.Linear(IN_DIM, OUT_DIM, bias=None).to(device=device)
+    print(f"Before init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+    model = MlpModel().to(device=device)
+    print(f"model numel {get_model_numel(model)}")  # 4GB
+    print(f"After init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    print(f"Before init x1&2&3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+    x1 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
+    x2 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
+    x3 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
+
+    x1.requires_grad_()
+    x2.requires_grad_()
+    x3.requires_grad_()
+    print(f"After init x1&2&3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    ############
+    # step1:
+    ############
+    print(f"\nStep1")
+
+    # loss1
+    print(f"Before Fwd x1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+    y1 = model(x1)
+    print(f"After Fwd x1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    print(f"Before loss1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+    loss1 = y1.sum()
+    print(f"After loss1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    # dx1
+    backward_b(loss1, x1, model)
+
+    # dw1
+    backward_w(loss1, model)
+
+    deallocate_output_tensor(x1)
+    deallocate_output_tensor(y1)
+    # del x1
+    # del y1
+    print(f"After del x1&y1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    # print(f"\n Step1:collect:{gc.collect()}")
+    # print(f"object: {gc.get_objects()}")
+    # print(f"garbage: {gc.garbage}")
+
+    ############
+    # step2:
+    ############
+    print(f"\nStep2")
+
+    # loss2
+    y2 = model(x2)
+    loss2 = y2.sum()
+
+    # dx2
+    backward_b(loss2, x2, model)
+
+    # dw2
+    backward_w(loss2, model)
+    deallocate_output_tensor(x2)
+    deallocate_output_tensor(y2)
+    # del x2
+    # del y2
+    print(f"After del x2&y2: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    print(f"\n Step2:collect:{gc.collect()}")
+    # print(f"object: {gc.get_objects()}")
+    print(f"garbage: {gc.garbage}")
+
+    ############
+    # step3:
+    ############
+
+    print(f"\nStep3")
+
+    # loss3
+    y3 = model(x3)
+    loss3 = y3.sum()
+
+    # dx2
+    backward_b(loss3, x3, model)
+
+    # dw2
+    backward_w(loss3, model)
+
+    deallocate_output_tensor(x3)
+    deallocate_output_tensor(y3)
+    # del x3
+    # del y3
+
+    print(f"After del x3&y3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    print(f"\n Step3:collect:{gc.collect()}")
+    # print(f"object: {gc.get_objects()}")
+    print(f"garbage: {gc.garbage}")
+
+
+# del activation
+def activation_dx_dw():
+    device = "cuda:0"
+    # model = nn.Linear(IN_DIM, OUT_DIM, bias=None).to(device=device)
+    print(f"Before init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+    model = MlpModel().to(device=device)
+    x1 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
+    x2 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
+    x3 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
+
+    x1.requires_grad_()
+    x2.requires_grad_()
+    x3.requires_grad_()
+    print(f"After init Model, x1,x2,x3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    activations = {}
+
+    def register_hooks(module):
+        def activation_hook(module, input, output):
+            activations[f"{module.__class__.__name__}_{id(module)}"] = output.detach()
+
+        def bwd_hook(module, grad_input, grad_output):
+            del activations[f"{module.__class__.__name__}_{id(module)}"]
+
+        module.register_forward_hook(activation_hook)
+        module.register_backward_hook(bwd_hook)
+
+    model.apply(register_hooks)
+
+    ############
+    # step1:
+    ############
+    print(f"\nStep1")
+
+    # loss1
+    loss1 = model(x1).sum()
+
+    # dx1
+    backward_b(loss1, x1, model)
+
+    # dw1
+    backward_w(loss1, model)
+
+    del loss1, x1
+    print(f"After del x1&y1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    ############
+    # step2:
+    ############
+    print(f"\nStep2")
+
+    # loss2
+    loss2 = model(x2).sum()
+
+    # dx2
+    backward_b(loss2, x2, model)
+
+    # dw2
+    backward_w(loss2, model)
+
+    # deallocate_output_tensor(x2)
+    # deallocate_output_tensor(loss2)
+    del x2, loss2
+    print(f"After del x2&y2: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    ############
+    # step3:
+    ############
+    print(f"\nStep3")
+
+    # loss3
+    loss3 = model(x3).sum()
+
+    # dx2
+    backward_b(loss3, x3, model)
+
+    # dw2
+    backward_w(loss3, model)
+
+    del x3, loss3
+
+    print(f"After del x3&y3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+
+# text dx dw in model chunk
+def model_chunk_dx_dw():
+    device = "cuda:0"
+    num_layers = 4
+    print(f"Before init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+    model = MlpModel(in_dim=4096, out_dim=4096, num_layers=num_layers).to(device=device)
+    x = torch.rand(4096, 4096).to(device=device)
+    x.requires_grad_()
+
+    model_chunk_0 = torch.nn.ModuleList()  # for layer 1 & 2
+    model_chunk_1 = torch.nn.ModuleList()  # for layer 3 & 4
+
+    for idx, sub_model in enumerate(model.layers):
+        if idx < 2:
+            model_chunk_0.append(sub_model).cuda()
+        else:
+            model_chunk_1.append(sub_model).cuda()
+
+    print(f"After init Model & input: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    # Step1:chunk 0 fwd
+    activation = dict()  # layer_id: activation
+    out = x
+    for i in range(len(model_chunk_0)):
+        layer = model_chunk_0[i]
+        activation[i] = layer(out)
+    print(f"After chunk0 fwd: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+    # Step2:chunk 1 fwd
+    for i in range(len(model_chunk_1)):
+        layer = model_chunk_0[i]
+        activation[i + 2] = layer(out)
+    print(f"After chunk1 fwd: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
+
+    # Step3:chunk 1 bwd b: dx=w*dy & bwd w:dw=x*dy
+    # visit layer reversely
+    for i in range(len(model_chunk_1) - 1, -1, -1):
+        layer = model_chunk_1[i]
+        global_layer_idx = i + 2
+        prev_global_layer_idx = i + 1 if i + 1 > 0 else None
+        i + 3 if i + 3 < 4 else None
+
+        # bwd b
+        if global_layer_idx == num_layers - 1:  # last layer in last chunk; calculate loss
+            loss = activation[global_layer_idx].sum()
+            x = activation[prev_global_layer_idx]
+            backward_b(loss, x, layer)
+        else:
+            loss = activation[global_layer_idx].sum()
+            x = activation[prev_global_layer_idx]
+            backward_b(loss, x, layer)
+
+        # bwd w
+        backward_w(loss, layer)
+
+
+def test_dx_dw_linear_benchmark():
+    device = "cuda:0"
+    model = nn.Linear(4096, 4096, bias=None).to(device=device)
+    # print(f"model numel {get_model_numel(model)}") # 4GB
+    x1 = torch.rand(4096, 4096).to(device=device)
+    # x2 = torch.rand(4096, 4096).to(device=device)
+    ref_model = deepcopy(model)
+    ref_x1 = x1.clone()
+    # ref_x2 = x1.clone()
+
+    # first step
+    x1.requires_grad_()
+    # x2.requires_grad_()
+    ref_x1.requires_grad_()
+    # ref_x2.requires_grad_()
+
+    # loss for dx_dw bwd
+    loss1 = model(x1).sum()
+    # loss2 = model(x2).sum()
+
+    # loss for common bwd
+    ref_model(ref_x1).sum()
+    # ref_loss2 = ref_model(ref_x2).sum()
+
+    with torch.profiler.profile(
+        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
+        # schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+        on_trace_ready=torch.profiler.tensorboard_trace_handler(
+            f"/home/nvme-share/home/duanjunwen/ColossalAI/tests/test_pipeline/test_schedule"
+        ),
+        record_shapes=True,
+        profile_memory=True,
+        with_stack=True,
+        with_flops=True,
+    ) as prof:
+        # dx1
+        torch.cuda.synchronize()
+        bwd_b_start_time = time.time()
+        backward_b(loss1, x1, model)
+        bwd_b_end_time = time.time()
+        print(f"loss_1 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
+
+        for p in model.parameters():
+            assert p.grad is None
+        assert x1.grad is not None
+
+        # dw1
+        torch.cuda.synchronize()
+        bwd_w_start_time = time.time()
+        backward_w(loss1, model)
+        bwd_w_end_time = time.time()
+        print(f"loss_1 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
+        for p in model.parameters():
+            assert p.grad is not None
+
+        # # common bwd 1
+        # torch.cuda.synchronize()
+        # comm_bwd_start_time = time.time()
+        # ref_loss1.backward()
+        # comm_bwd_end_time = time.time()
+        # print(f"loss_1 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
+
+
+def test_dx_dw_attn_benchmark():
+    device = "cuda:0"
+    model = Attention(dim=4096).to(device=device)
+    # print(f"model numel {get_model_numel(model)}") # 4GB
+    x1 = torch.rand(1, 256, 4096).to(device=device)
+    # x2 = torch.rand(1, 256, 4096).to(device=device)
+    ref_model = deepcopy(model)
+    ref_x1 = x1.clone()
+    # ref_x2 = x1.clone()
+
+    # first step
+    x1.requires_grad_()
+    # x2.requires_grad_()
+    ref_x1.requires_grad_()
+    # ref_x2.requires_grad_()
+
+    # loss for dx_dw bwd
+    loss1 = model(x1).sum()
+    # loss2 = model(x2).sum()
+
+    # loss for common bwd
+    ref_model(ref_x1).sum()
+    # ref_loss2 = ref_model(ref_x2).sum()
+
+    with torch.profiler.profile(
+        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
+        # schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
+        on_trace_ready=torch.profiler.tensorboard_trace_handler(
+            f"/home/nvme-share/home/duanjunwen/ColossalAI/tests/test_pipeline/test_schedule"
+        ),
+        record_shapes=True,
+        profile_memory=True,
+        with_stack=True,
+        with_flops=True,
+    ) as prof:
+        # dx1
+        torch.cuda.synchronize()
+        bwd_b_start_time = time.time()
+        backward_b(loss1, x1, model)
+        bwd_b_end_time = time.time()
+        print(f"loss_1 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
+
+        for p in model.parameters():
+            assert p.grad is None
+        assert x1.grad is not None
+
+        # dw1
+        torch.cuda.synchronize()
+        bwd_w_start_time = time.time()
+        backward_w(loss1, model)
+        bwd_w_end_time = time.time()
+        print(f"loss_1 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
+        for p in model.parameters():
+            assert p.grad is not None
+
+        # # common bwd 1
+        # torch.cuda.synchronize()
+        # comm_bwd_start_time = time.time()
+        # ref_loss1.backward()
+        # comm_bwd_end_time = time.time()
+        # print(f"loss_1 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
+
+
+if __name__ == "__main__":
+    # test_dx_dw_split()
+    # test_double_dx_dw_split_nsync()
+    # test_double_dx_dw_split_sync()
+    # mem_dx_dw()
+    # activation_dx_dw()
+    # test_dx_dw_linear_benchmark()
+    test_dx_dw_attn_benchmark()

From a11b4b50a78e0f7754c406c3a982863fee71ac58 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Mon, 14 Oct 2024 07:12:14 +0000
Subject: [PATCH 123/167] [feat] support use_zbv in llama, mixtral modeling;
 only replace Linear1D_Col/Row policy;

---
 .../booster/plugin/hybrid_parallel_plugin.py  |  1 +
 .../plugin/moe_hybrid_parallel_plugin.py      |  1 +
 colossalai/pipeline/weight_grad_store.py      | 70 -------------------
 colossalai/shardformer/policies/llama.py      | 42 +++++++++--
 colossalai/shardformer/policies/mixtral.py    | 40 ++++++++---
 colossalai/shardformer/shard/shard_config.py  |  1 +
 6 files changed, 70 insertions(+), 85 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 5561533e1930..673701017521 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1217,6 +1217,7 @@ def __init__(
             gradient_checkpoint_config=gradient_checkpoint_config,
             fp8_communication=fp8_communication,
             inner_ring_size=inner_ring_size,
+            use_zbv=(pp_style == "zbv"),
         )
         self.amp_config = dict(
             initial_scale=initial_scale,
diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index 8b62a1e2bd8c..b7e65c6a2f78 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -373,6 +373,7 @@ def __init__(
             make_vocab_size_divisible_by=make_vocab_size_divisible_by,
             gradient_checkpoint_config=gradient_checkpoint_config,
             fp8_communication=fp8_communication,
+            use_zbv=(pp_style == "zbv"),
         )
         self.amp_config = dict(
             initial_scale=initial_scale,
diff --git a/colossalai/pipeline/weight_grad_store.py b/colossalai/pipeline/weight_grad_store.py
index 5d7f76649483..12963350f462 100644
--- a/colossalai/pipeline/weight_grad_store.py
+++ b/colossalai/pipeline/weight_grad_store.py
@@ -34,73 +34,3 @@ def pop(cls, chunk=0):
                     weight.grad = grad_weight
         else:
             raise Exception("Pop empty queue.")
-
-    # @classmethod
-    # def clear(cls, model, chunk=0):
-    #     weight_grad_tasks = []
-    #     while cls.weight_grad_queue[chunk].qsize() > 0:
-    #         stored_grads = cls.weight_grad_queue[chunk].get()
-    #         if len(weight_grad_tasks) == 0:
-    #             for _ in stored_grads:
-    #                 weight_grad_tasks.append([])
-    #         else:
-    #             assert len(weight_grad_tasks) == len(stored_grads)
-    #         for i, task in enumerate(stored_grads):
-    #             weight_grad_tasks[i].append(task)
-    #     weight_params = []
-    #     handles = []
-    #     if get_args().overlap_grad_reduce:
-    #         handles += model.async_reduce_grad()
-
-    #     output_layer_weight = None
-    #     if parallel_state.is_pipeline_last_stage():
-    #         assert len(weight_grad_tasks) > 0
-    #         output_layer_grads = weight_grad_tasks[0]
-    #         for j in range(len(output_layer_grads)):
-    #             total_input, grad_output, weight, func = output_layer_grads[j]
-    #             if output_layer_weight is None:
-    #                 output_layer_weight = weight
-    #             assert output_layer_weight is weight
-    #             func(total_input, grad_output, weight.main_grad)
-    #             output_layer_grads[j] = None  # release memory
-    #         weight_grad_tasks = weight_grad_tasks[1:]
-    #         if get_args().overlap_grad_reduce:
-    #             handles += model.async_reduce_grad(output_layer_weight)
-
-    #     if parallel_state.is_pipeline_first_stage() or parallel_state.is_pipeline_last_stage():
-    #         model_module = get_attr_wrapped_model(model, 'pre_process', return_model_obj=True)
-    #         if model_module.share_embeddings_and_output_weights:
-    #             # if share_embeddings_and_output_weights, wait all-reduce for embeddings
-    #             for handle in handles:
-    #                 if handle is not None:
-    #                     handle.wait()
-    #             handles = []
-
-    #     config = get_model_config(model)
-    #     # Do async all-reduce for embedding grads firstly, so that the rank 0 won't
-    #     # be blocked
-    #     embedding_handles = _allreduce_embedding_grads([model], config, async_op=True)
-    #     handles += embedding_handles
-
-    #     for i in range(len(weight_grad_tasks)):
-    #         tasks = weight_grad_tasks[i]
-    #         param = None
-    #         for j in range(len(tasks)):
-    #             total_input, grad_output, weight, func = tasks[j]
-    #             if param is None:
-    #                 param = weight
-    #             assert param is weight
-    #             assert not (weight is output_layer_weight)
-    #             func(total_input, grad_output, weight.main_grad)
-    #             tasks[j] = None  # release memory
-    #         weight_params.append(param)
-    #         if get_args().overlap_grad_reduce:
-    #             # All-reduce param grad here
-    #             handles += model.async_reduce_grad(param)
-    #         weight_grad_tasks[i] = None  # release memory
-
-    #     # timers('wait_all_reduce', log_level=1).start(barrier=False)
-    #     for handle in embedding_handles:
-    #         if handle is not None:
-    #             handle.wait()
-    #     # timers('wait_all_reduce').stop()
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index f9897b8b757c..5d48a16c3706 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -126,37 +126,65 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(
+                            seq_parallel_mode=sp_mode,
+                            fp8_communication=self.shard_config.fp8_communication,
+                            use_zbv=self.shard_config.use_zbv,
+                        ),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(
+                            seq_parallel_mode=sp_mode,
+                            fp8_communication=self.shard_config.fp8_communication,
+                            use_zbv=self.shard_config.use_zbv,
+                        ),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(
+                            seq_parallel_mode=sp_mode,
+                            fp8_communication=self.shard_config.fp8_communication,
+                            use_zbv=self.shard_config.use_zbv,
+                        ),
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(
+                            seq_parallel_mode=sp_mode,
+                            fp8_communication=self.shard_config.fp8_communication,
+                            use_zbv=self.shard_config.use_zbv,
+                        ),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.gate_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(
+                            seq_parallel_mode=sp_mode,
+                            fp8_communication=self.shard_config.fp8_communication,
+                            use_zbv=self.shard_config.use_zbv,
+                        ),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.up_proj",
                         target_module=Linear1D_Col,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(
+                            seq_parallel_mode=sp_mode,
+                            fp8_communication=self.shard_config.fp8_communication,
+                            use_zbv=self.shard_config.use_zbv,
+                        ),
                     ),
                     SubModuleReplacementDescription(
                         suffix="mlp.down_proj",
                         target_module=Linear1D_Row,
-                        kwargs=dict(seq_parallel_mode=sp_mode, fp8_communication=self.shard_config.fp8_communication),
+                        kwargs=dict(
+                            seq_parallel_mode=sp_mode,
+                            fp8_communication=self.shard_config.fp8_communication,
+                            use_zbv=self.shard_config.use_zbv,
+                        ),
                     ),
                 ],
             )
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index 9d8d2b54b32c..705f2b19fd8c 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -124,27 +124,43 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                     SubModuleReplacementDescription(
                         suffix="self_attn.q_proj",
                         target_module=Linear1D_Col,
-                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                            "use_zbv": self.shard_config.use_zbv,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.k_proj",
                         target_module=Linear1D_Col,
-                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                            "use_zbv": self.shard_config.use_zbv,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.v_proj",
                         target_module=Linear1D_Col,
-                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                            "use_zbv": self.shard_config.use_zbv,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="self_attn.o_proj",
                         target_module=Linear1D_Row,
-                        kwargs={"fp8_communication": self.shard_config.fp8_communication},
+                        kwargs={
+                            "fp8_communication": self.shard_config.fp8_communication,
+                            "use_zbv": self.shard_config.use_zbv,
+                        },
                     ),
                     SubModuleReplacementDescription(
                         suffix="block_sparse_moe.gate",
                         target_module=Linear1D_Col,
-                        kwargs={"gather_output": True, "fp8_communication": self.shard_config.fp8_communication},
+                        kwargs={
+                            "gather_output": True,
+                            "fp8_communication": self.shard_config.fp8_communication,
+                            "use_zbv": self.shard_config.use_zbv,
+                        },
                     ),
                 ],
             )
@@ -322,9 +338,13 @@ def module_policy(self):
                         SubModuleReplacementDescription(
                             suffix="lm_head",
                             target_module=Linear1D_Col,
-                            kwargs=dict(gather_output=True, fp8_communication=self.shard_config.fp8_communication),
+                            kwargs=dict(
+                                gather_output=True,
+                                fp8_communication=self.shard_config.fp8_communication,
+                                use_zbv=self.shard_config.use_zbv,
+                            ),
                         )
-                    ]
+                    ],
                 )
             }
             policy.update(new_item)
@@ -380,7 +400,11 @@ def module_policy(self):
                         SubModuleReplacementDescription(
                             suffix="score",
                             target_module=Linear1D_Col,
-                            kwargs=dict(gather_output=True, fp8_communication=self.shard_config.fp8_communication),
+                            kwargs=dict(
+                                gather_output=True,
+                                fp8_communication=self.shard_config.fp8_communication,
+                                use_zbv=self.shard_config.use_zbv,
+                            ),
                         )
                     ]
                 )
diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py
index 1219119bb095..33e93fa515b7 100644
--- a/colossalai/shardformer/shard/shard_config.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -49,6 +49,7 @@ class ShardConfig:
     make_vocab_size_divisible_by: int = 64
     gradient_checkpoint_config: Optional[GradientCheckpointConfig] = None
     extra_kwargs: Dict[str, Any] = field(default_factory=dict)
+    use_zbv: bool = False
 
     # For ring attention
     inner_ring_size: Optional[int] = None

From abd455189ddd3546f889ef9f8b476f421d72438b Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Mon, 14 Oct 2024 07:38:02 +0000
Subject: [PATCH 124/167] [fix] fix test case; moe error in second iter

---
 colossalai/shardformer/layer/_operation.py    |  8 +++--
 colossalai/shardformer/layer/linear.py        | 32 ++++++++++++++++---
 .../test_schedule/test_zerobubble_pp.py       | 11 ++++---
 3 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index 626a009ec430..9d3d91034c45 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -129,7 +129,7 @@ class LinearWithAsyncCommunication(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False, use_zbv=True):
+    def forward(ctx, input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False, use_zbv=False):
         ctx.save_for_backward(input_, weight, bias)
         ctx.use_bias = bias is not None
         ctx.process_group = process_group
@@ -1094,9 +1094,11 @@ def matmul_with_async_comm(input_, weight, bias, process_group, async_grad_allre
     )
 
 
-def linear_with_async_comm(input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False):
+def linear_with_async_comm(
+    input_, weight, bias, process_group, async_grad_allreduce, fp8_communication=False, use_zbv=False
+):
     return LinearWithAsyncCommunication.apply(
-        input_, weight, bias, process_group, async_grad_allreduce, fp8_communication
+        input_, weight, bias, process_group, async_grad_allreduce, fp8_communication, use_zbv
     )
 
 
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index 25f4228a4e62..cb3ad0b45260 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -85,6 +85,7 @@ def __init__(
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
         fp8_communication: bool = False,
+        use_zbv: bool = False,
         **kwargs,
     ):
         super().__init__(weight=weight, bias_=bias_, **kwargs)
@@ -100,6 +101,7 @@ def __init__(
         self.device = device
         self.process_group = process_group
         self.fp8_communication = fp8_communication
+        self.use_zbv = use_zbv
 
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
@@ -206,7 +208,13 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
                 input_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
             )
             output_parallel = linear_with_async_comm(
-                input_parallel, self.weight, bias, self.process_group, False, fp8_communication=self.fp8_communication
+                input_parallel,
+                self.weight,
+                bias,
+                self.process_group,
+                False,
+                fp8_communication=self.fp8_communication,
+                use_zbv=self.use_zbv,
             )
         elif self.seq_parallel_mode == "ring":
             output_parallel = linear_gather_forward_reducescatter_backward(
@@ -214,7 +222,13 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
             )
         else:
             output_parallel = linear_with_async_comm(
-                input_parallel, self.weight, bias, self.process_group, True, fp8_communication=self.fp8_communication
+                input_parallel,
+                self.weight,
+                bias,
+                self.process_group,
+                True,
+                fp8_communication=self.fp8_communication,
+                use_zbv=self.use_zbv,
             )
 
         if self.gather_output:
@@ -272,6 +286,7 @@ def __init__(
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
         stream_chunk_num: int = 1,
         fp8_communication: bool = False,
+        use_zbv: bool = False,
     ):
         super().__init__()
 
@@ -287,6 +302,7 @@ def __init__(
         self.seq_parallel_dim = seq_parallel_dim
         self.num_partitions = dist.get_world_size(self.process_group)
         self.fp8_communication = fp8_communication
+        self.use_zbv = use_zbv
 
         if skip_bias_add and not bias:
             raise ValueError("cannot skip bias addition if bias is None")
@@ -428,10 +444,14 @@ def forward(self, input_: Tensor) -> Tensor:
                 output = torch.cat(output_parallel_list, dim=-1)
         else:
             if self.seq_parallel_mode is None:
-                output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
+                output_parallel = linear_with_async_comm(
+                    input_, self.weight, None, self.process_group, False, use_zbv=self.use_zbv
+                )
                 output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
             elif self.seq_parallel_mode == "split_gather":
-                output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
+                output_parallel = linear_with_async_comm(
+                    input_, self.weight, None, self.process_group, False, use_zbv=self.use_zbv
+                )
                 output = reducescatter_forward_gather_backward(
                     output_parallel, self.process_group, self.seq_parallel_dim, fp8_communication=self.fp8_communication
                 )
@@ -444,7 +464,9 @@ def forward(self, input_: Tensor) -> Tensor:
                     ring=True,
                 )
             else:
-                output_parallel = linear_with_async_comm(input_, self.weight, None, self.process_group, False)
+                output_parallel = linear_with_async_comm(
+                    input_, self.weight, None, self.process_group, False, use_zbv=self.use_zbv
+                )
                 output = reduce_forward(output_parallel, self.process_group)
 
         if not self.skip_bias_add:
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index 4225da802d78..fb59e0b2cc30 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -752,8 +752,9 @@ def run_with_hybridplugin(test_config):
 @parameterize(
     "config",
     [
-        (0, 1, 4, 1, 1),
-        (1, 2, 2, 1, 1),
+        # TODO:ERR in second iter
+        # (0, 1, 4, 1, 1),
+        # (1, 2, 2, 1, 1),
         (1, 2, 1, 2, 1),
         (1, 2, 1, 1, 2),
     ],
@@ -905,9 +906,9 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
         torch_optimizer.zero_grad()
         assert_loose_close(parallel_output, torch_output_sum, dtype=dtype)
         print(f"rank {dist.get_rank()} config {test_config}  test passed")
-        clear_layout_converter()
-        Randomizer.reset_index()
-        torch.cuda.empty_cache()
+    clear_layout_converter()
+    Randomizer.reset_index()
+    torch.cuda.empty_cache()
 
 
 def run_dist(rank, world_size, port):

From 160e9a41758fe609a133f9331d4763e25196ad82 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Mon, 14 Oct 2024 08:22:51 +0000
Subject: [PATCH 125/167] [feat]EPMixtralSparseMoeBlock (op in MOE) support
 zbv;

---
 colossalai/shardformer/modeling/mixtral.py | 8 +++++---
 colossalai/shardformer/policies/mixtral.py | 1 +
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py
index a783b5c5eb26..3687cfb99c5f 100644
--- a/colossalai/shardformer/modeling/mixtral.py
+++ b/colossalai/shardformer/modeling/mixtral.py
@@ -60,6 +60,7 @@ def setup_process_groups(
         moe_dp_group: ProcessGroup,
         ep_group: ProcessGroup,
         fp8_communication: bool = False,
+        use_zbv: bool = False,
     ):
         assert tp_group is not None
         assert moe_dp_group is not None
@@ -70,6 +71,7 @@ def setup_process_groups(
         self.ep_rank = dist.get_rank(ep_group)
         self.ep_group = ep_group
         self.fp8_communication = fp8_communication
+        self.use_zbv = use_zbv
 
         if self.num_experts % self.ep_size != 0:
             raise ValueError("The number of experts must be divisible by the number of expert parallel groups.")
@@ -89,13 +91,13 @@ def setup_process_groups(
         if self.tp_group.size() > 1:
             for expert in held_experts:
                 expert.w1 = Linear1D_Col.from_native_module(
-                    expert.w1, self.tp_group, fp8_communication=self.fp8_communication
+                    expert.w1, self.tp_group, fp8_communication=self.fp8_communication, use_zbv=self.use_zbv
                 )
                 expert.w3 = Linear1D_Col.from_native_module(
-                    expert.w3, self.tp_group, fp8_communication=self.fp8_communication
+                    expert.w3, self.tp_group, fp8_communication=self.fp8_communication, use_zbv=self.use_zbv
                 )
                 expert.w2 = Linear1D_Row.from_native_module(
-                    expert.w2, self.tp_group, fp8_communication=self.fp8_communication
+                    expert.w2, self.tp_group, fp8_communication=self.fp8_communication, use_zbv=self.use_zbv
                 )
 
         for p in self.experts.parameters():
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index 705f2b19fd8c..de546b3c5119 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -195,6 +195,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                             "tp_group": self.shard_config.tensor_parallel_process_group,
                             "moe_dp_group": self.shard_config.moe_dp_group,
                             "fp8_communication": self.shard_config.fp8_communication,
+                            "use_zbv": self.shard_config.use_zbv,
                         },
                     )
                 ],

From 23199e34cc0ae6fdd6c9297ef088bf4dc4713b52 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 14 Oct 2024 18:01:53 +0800
Subject: [PATCH 126/167] fix

---
 colossalai/shardformer/layer/attn.py          | 63 ++++++-------------
 colossalai/shardformer/modeling/gpt2.py       |  2 -
 colossalai/shardformer/modeling/llama.py      |  3 -
 .../test_layer/test_ring_attn.py              |  8 +--
 4 files changed, 20 insertions(+), 56 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 36a239142d62..c49113a2466e 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -7,6 +7,7 @@
 import torch.nn.functional as F
 from einops import rearrange
 
+from colossalai.cluster import ProcessGroupMesh
 from colossalai.kernel.kernel_loader import (
     FlashAttentionDaoLoader,
     FlashAttentionForFloatAndCustomMaskLoader,
@@ -431,7 +432,7 @@ class RingAttention(torch.autograd.Function):
     INTER_RING_GROUP_COPY: dist.ProcessGroup = None
 
     @staticmethod
-    def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
+    def get_double_ring_groups(sp_group, inner_ring_size=None):
         """
         Get 2D ring groups for the given process group. Generally, to avoid congestion, the inner ring size
         shouldn't be larger than the number of NICs on each node.
@@ -442,7 +443,6 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
             Tuple[dist.ProcessGroup, dist.ProcessGroup]: Inner-ring process group and inter-ring process group.
         """
         sp_size = dist.get_world_size(sp_group)
-        tp_size = dist.get_world_size(tp_group) if tp_group is not None else 1
         sp_rank = dist.get_rank(sp_group)
 
         assert inner_ring_size is not None
@@ -465,45 +465,22 @@ def get_double_ring_groups(sp_group, tp_group, inner_ring_size=None):
         inner_ring_group = None
         inter_ring_group = None
 
-        world_size = dist.get_world_size()
-        rank = dist.get_rank()
-
-        num_ring_size = world_size // num_rings
-
-        if tp_size > 1:
-            # Create inner ring groups
-            ranks = []
-            for i in range(num_rings):
-                start = i * num_ring_size
-                end = (i + 1) * num_ring_size
-                for idx in range(start, end):
-                    inner_rank = [idx + k * tp_size for k in range(inner_ring_size) if idx + k * tp_size < end]
-                    if len(inner_rank) == inner_ring_size and inner_rank not in ranks:
-                        ranks.append(inner_rank)
-                        group = dist.new_group(inner_rank)
-                        if rank in inner_rank:
-                            inner_ring_group = group
-            # Create inter ring groups
-            for i in range(num_ring_size):
-                inter_rank = [i + j * num_ring_size for j in range(num_rings)]
-                group = dist.new_group(inter_rank)
-                if rank in inter_rank:
-                    inter_ring_group = group
+        inter_axis, inner_axis = 0, 1
+        pg_mesh = ProcessGroupMesh(num_rings, inner_ring_size)
 
-        else:
-            # Create inner ring groups
-            for i in range(inner_ring_size):
-                ranks = list(range(i * inner_ring_size, (i + 1) * inner_ring_size))
-                group = dist.new_group(ranks)
-                if sp_rank in ranks:
-                    inner_ring_group = group
-
-            # Create inter ring groups
-            for i in range(num_rings):
-                ranks = list(range(i, sp_size, num_rings))
-                group = dist.new_group(ranks)
-                if sp_rank in ranks:
-                    inter_ring_group = group
+        # Create inner ring groups
+        for i in range(inner_ring_size):
+            ranks = list(range(i * inner_ring_size, (i + 1) * inner_ring_size))
+            group = pg_mesh.get_group_along_axis(inner_axis)
+            if sp_rank in ranks:
+                inner_ring_group = group
+
+        # Create inter ring groups
+        for i in range(num_rings):
+            ranks = list(range(i, sp_size, num_rings))
+            group = pg_mesh.get_group_along_axis(inter_axis)
+            if sp_rank in ranks:
+                inter_ring_group = group
 
         return inner_ring_group, inter_ring_group
 
@@ -522,7 +499,6 @@ def attention(
         deterministic=False,
         return_softmax=False,
         inner_ring_size=None,
-        tp_group=None,
         **kwargs,
     ):
         """
@@ -570,9 +546,7 @@ def attention(
 
         if inner_ring_size != None:
             RingAttention.SP_GROUP = sp_group
-            inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(
-                sp_group, tp_group, inner_ring_size
-            )
+            inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(sp_group, inner_ring_size)
             RingAttention.INNER_RING_GROUP = inner_ring_group
             RingAttention.INTER_RING_GROUP = inter_ring_group
         else:
@@ -619,7 +593,6 @@ def attention(
             attention_mask_type == AttnMaskType.PADDED_CAUSAL,
             inner_ring_group,
             inter_ring_group,
-            tp_group,
         )
 
         if attention_mask_type == AttnMaskType.PADDED_CAUSAL:
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 43b3d2c5e52f..aec8e84a07c6 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -858,7 +858,6 @@ def forward(
 
         sp_mode = shard_config.sequence_parallelism_mode
         sp_group = shard_config.sequence_parallel_process_group
-        tp_group = shard_config.tensor_parallel_process_group
 
         if sp_mode == "ring_attn":
             attn_output = RingAttention.attention(
@@ -870,7 +869,6 @@ def forward(
                 dropout_p=dropout_p,
                 scale=scale,
                 inner_ring_size=shard_config.inner_ring_size,
-                tp_group=tp_group,
             )
         else:
             attn_output = ColoAttention.attention(query, key, value, **attention_mask, dropout_p=dropout_p, scale=scale)
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index aa761fd211e7..47c17e7494f2 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -563,8 +563,6 @@ def forward(
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
 
-        tp_group = shard_config.tensor_parallel_process_group
-
         if sp_mode == "ring_attn":
             attn_output = RingAttention.attention(
                 query_states,
@@ -573,7 +571,6 @@ def forward(
                 sp_group,
                 **attention_mask,
                 inner_ring_size=shard_config.inner_ring_size,
-                tp_group=tp_group,
             )
 
         elif shard_config.enable_flash_attention:
diff --git a/tests/test_shardformer/test_layer/test_ring_attn.py b/tests/test_shardformer/test_layer/test_ring_attn.py
index b9291a06199a..61ea0677fcac 100644
--- a/tests/test_shardformer/test_layer/test_ring_attn.py
+++ b/tests/test_shardformer/test_layer/test_ring_attn.py
@@ -5,7 +5,6 @@
 from torch.testing import assert_close
 
 import colossalai
-from colossalai.cluster import ProcessGroupMesh
 from colossalai.shardformer.layer import AttnMaskType
 from colossalai.shardformer.layer.attn import AttnMaskType, RingAttention
 from colossalai.shardformer.layer.utils import split_batch_zigzag, split_varlen_zigzag
@@ -21,10 +20,8 @@
 def check_ring_attn(seq_len, bs, nheads, d, dtype, sp_size, tp_size=1):
     torch.cuda.manual_seed(2)
     device = get_current_device()
-    sp_axis, tp_axis = 0, 1
-    pg_mesh = ProcessGroupMesh(sp_size, tp_size)
-    tp_group = pg_mesh.get_group_along_axis(tp_axis)
-    sp_group = pg_mesh.get_group_along_axis(sp_axis)
+    sp_group = dist.group.WORLD
+    sp_size = dist.get_world_size()
     # Some outliers may seem large, but our errors are still lower than
     # than Megatron-LM context parallel's
     # (https://github.com/NVIDIA/TransformerEngine/blob/33a3d02f81c56e6f7b542c09bfa86657078d57fb/tests/pytorch/fused_attn/run_fused_attn_with_cp.py#L215)
@@ -47,7 +44,6 @@ def check_ring_attn(seq_len, bs, nheads, d, dtype, sp_size, tp_size=1):
         AttnMaskType.CAUSAL,
         return_softmax=True,
         inner_ring_size=max(2, sp_size // 2),
-        tp_group=tp_group,
     )
     ring_out = ring_out.transpose(1, 2)
     out, lse, _ = flash_attn_qkvpacked_func(

From 3201377e94d3cc8d2753214e4a136944f230ea42 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 14 Oct 2024 18:06:24 +0800
Subject: [PATCH 127/167] fix

---
 colossalai/shardformer/modeling/gpt2.py             | 1 -
 tests/test_shardformer/test_layer/test_ring_attn.py | 6 +++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index aec8e84a07c6..798fca88fb4f 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -858,7 +858,6 @@ def forward(
 
         sp_mode = shard_config.sequence_parallelism_mode
         sp_group = shard_config.sequence_parallel_process_group
-
         if sp_mode == "ring_attn":
             attn_output = RingAttention.attention(
                 query,
diff --git a/tests/test_shardformer/test_layer/test_ring_attn.py b/tests/test_shardformer/test_layer/test_ring_attn.py
index 61ea0677fcac..bcb2c1f8a040 100644
--- a/tests/test_shardformer/test_layer/test_ring_attn.py
+++ b/tests/test_shardformer/test_layer/test_ring_attn.py
@@ -17,7 +17,7 @@
 @parameterize("nheads", [5])
 @parameterize("d", [128])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
-def check_ring_attn(seq_len, bs, nheads, d, dtype, sp_size, tp_size=1):
+def check_ring_attn(seq_len, bs, nheads, d, dtype, sp_size):
     torch.cuda.manual_seed(2)
     device = get_current_device()
     sp_group = dist.group.WORLD
@@ -165,7 +165,7 @@ def launch_single_ring(rank, world_size, port):
 
 def launch_double_ring(rank, world_size, port):
     colossalai.launch(rank, world_size, "localhost", port)
-    check_ring_attn(sp_size=4, tp_size=2)
+    check_ring_attn(sp_size=world_size)
 
 
 @rerun_if_address_is_in_use()
@@ -175,7 +175,7 @@ def test_ring_attn(world_size):
 
 
 @rerun_if_address_is_in_use()
-@parameterize("world_size", [8])
+@parameterize("world_size", [4])
 def test_double_ring(world_size):
     spawn(launch_double_ring, nprocs=world_size)
 

From fe9208feaca07159dbff387342346df510ab8a0d Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 14 Oct 2024 18:07:56 +0800
Subject: [PATCH 128/167] fix

---
 colossalai/shardformer/layer/attn.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index c49113a2466e..1f39a44b3f43 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -623,7 +623,6 @@ def forward(
         is_packed: Optional[bool] = False,
         inner_ring_group: Optional[dist.ProcessGroup] = None,
         inter_ring_group: Optional[dist.ProcessGroup] = None,
-        tp_group: Optional[dist.ProcessGroup] = None,
     ):
 
         cu_seqlens_q = cu_seqlens_kv = cu_seqlens
@@ -1120,7 +1119,7 @@ def _other_ring_backward(ring_num_idx, dq):
         if not is_packed:
             dq, dk, dv = [x.view(b, sq, *x.shape[-2:]) for x in (dq, dk, dv)]
 
-        return (dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None, None)
+        return (dq, dk, dv, None, None, None, None, None, None, None, None, None, None, None)
 
     @staticmethod
     def prepare_varlen_batch(

From 8ff7d0c78048b4231e1f772a8227282ae7d5822a Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Mon, 14 Oct 2024 18:16:03 +0800
Subject: [PATCH 129/167] fix

---
 tests/test_shardformer/test_layer/test_ring_attn.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/test_shardformer/test_layer/test_ring_attn.py b/tests/test_shardformer/test_layer/test_ring_attn.py
index bcb2c1f8a040..0ffea2016573 100644
--- a/tests/test_shardformer/test_layer/test_ring_attn.py
+++ b/tests/test_shardformer/test_layer/test_ring_attn.py
@@ -17,11 +17,10 @@
 @parameterize("nheads", [5])
 @parameterize("d", [128])
 @parameterize("dtype", [torch.bfloat16, torch.float16])
-def check_ring_attn(seq_len, bs, nheads, d, dtype, sp_size):
+def check_ring_attn(seq_len, bs, nheads, d, dtype, inner_ring_size):
     torch.cuda.manual_seed(2)
     device = get_current_device()
     sp_group = dist.group.WORLD
-    sp_size = dist.get_world_size()
     # Some outliers may seem large, but our errors are still lower than
     # than Megatron-LM context parallel's
     # (https://github.com/NVIDIA/TransformerEngine/blob/33a3d02f81c56e6f7b542c09bfa86657078d57fb/tests/pytorch/fused_attn/run_fused_attn_with_cp.py#L215)
@@ -43,7 +42,7 @@ def check_ring_attn(seq_len, bs, nheads, d, dtype, sp_size):
         sp_group,
         AttnMaskType.CAUSAL,
         return_softmax=True,
-        inner_ring_size=max(2, sp_size // 2),
+        inner_ring_size=inner_ring_size,
     )
     ring_out = ring_out.transpose(1, 2)
     out, lse, _ = flash_attn_qkvpacked_func(
@@ -160,12 +159,12 @@ def check_packed_seq(seqlen, bs, nheads, d, dtype):
 def launch_single_ring(rank, world_size, port):
     colossalai.launch(rank, world_size, "localhost", port)
     check_packed_seq()
-    check_ring_attn(sp_size=world_size)
+    check_ring_attn(inner_ring_size=None)
 
 
 def launch_double_ring(rank, world_size, port):
     colossalai.launch(rank, world_size, "localhost", port)
-    check_ring_attn(sp_size=world_size)
+    check_ring_attn(inner_ring_size=2)
 
 
 @rerun_if_address_is_in_use()

From 3dc08c8a5a03d6bb28d865d2dbaae948c27e7e9a Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Tue, 15 Oct 2024 11:01:34 +0800
Subject: [PATCH 130/167] fix

---
 .../booster/plugin/hybrid_parallel_plugin.py      |  2 ++
 colossalai/shardformer/layer/attn.py              | 15 +++++++--------
 colossalai/shardformer/modeling/gpt2.py           |  1 +
 colossalai/shardformer/modeling/llama.py          |  1 +
 colossalai/shardformer/shard/shard_config.py      |  1 +
 .../test_shardformer/test_layer/test_ring_attn.py | 13 ++++++-------
 6 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index bb663f6a6d5e..d15a9f397703 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1180,7 +1180,9 @@ def __init__(
             gradient_checkpoint_config=gradient_checkpoint_config,
             fp8_communication=fp8_communication,
             inner_ring_size=inner_ring_size,
+            pg_mesh=self.pg_mesh,
         )
+
         self.amp_config = dict(
             initial_scale=initial_scale,
             growth_factor=growth_factor,
diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 1f39a44b3f43..6a972f075a6f 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -7,7 +7,6 @@
 import torch.nn.functional as F
 from einops import rearrange
 
-from colossalai.cluster import ProcessGroupMesh
 from colossalai.kernel.kernel_loader import (
     FlashAttentionDaoLoader,
     FlashAttentionForFloatAndCustomMaskLoader,
@@ -432,7 +431,7 @@ class RingAttention(torch.autograd.Function):
     INTER_RING_GROUP_COPY: dist.ProcessGroup = None
 
     @staticmethod
-    def get_double_ring_groups(sp_group, inner_ring_size=None):
+    def get_double_ring_groups(sp_group, pg_mesh, inner_ring_size=None):
         """
         Get 2D ring groups for the given process group. Generally, to avoid congestion, the inner ring size
         shouldn't be larger than the number of NICs on each node.
@@ -465,20 +464,17 @@ def get_double_ring_groups(sp_group, inner_ring_size=None):
         inner_ring_group = None
         inter_ring_group = None
 
-        inter_axis, inner_axis = 0, 1
-        pg_mesh = ProcessGroupMesh(num_rings, inner_ring_size)
-
         # Create inner ring groups
         for i in range(inner_ring_size):
             ranks = list(range(i * inner_ring_size, (i + 1) * inner_ring_size))
-            group = pg_mesh.get_group_along_axis(inner_axis)
+            group = pg_mesh.get_group_along_axis(2, ranks)
             if sp_rank in ranks:
                 inner_ring_group = group
 
         # Create inter ring groups
         for i in range(num_rings):
             ranks = list(range(i, sp_size, num_rings))
-            group = pg_mesh.get_group_along_axis(inter_axis)
+            group = pg_mesh.get_group_along_axis(2, ranks)
             if sp_rank in ranks:
                 inter_ring_group = group
 
@@ -499,6 +495,7 @@ def attention(
         deterministic=False,
         return_softmax=False,
         inner_ring_size=None,
+        pg_mesh=None,
         **kwargs,
     ):
         """
@@ -546,7 +543,9 @@ def attention(
 
         if inner_ring_size != None:
             RingAttention.SP_GROUP = sp_group
-            inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(sp_group, inner_ring_size)
+            inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(
+                sp_group, pg_mesh, inner_ring_size
+            )
             RingAttention.INNER_RING_GROUP = inner_ring_group
             RingAttention.INTER_RING_GROUP = inter_ring_group
         else:
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 798fca88fb4f..d75e9b14ae52 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -868,6 +868,7 @@ def forward(
                 dropout_p=dropout_p,
                 scale=scale,
                 inner_ring_size=shard_config.inner_ring_size,
+                pg_mesh=shard_config.pg_mesh,
             )
         else:
             attn_output = ColoAttention.attention(query, key, value, **attention_mask, dropout_p=dropout_p, scale=scale)
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 47c17e7494f2..ff7390643379 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -571,6 +571,7 @@ def forward(
                 sp_group,
                 **attention_mask,
                 inner_ring_size=shard_config.inner_ring_size,
+                pg_mesh=shard_config.pg_mesh,
             )
 
         elif shard_config.enable_flash_attention:
diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py
index 1219119bb095..14963f7a5f9d 100644
--- a/colossalai/shardformer/shard/shard_config.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -51,6 +51,7 @@ class ShardConfig:
     extra_kwargs: Dict[str, Any] = field(default_factory=dict)
 
     # For ring attention
+    pg_mesh: Optional[int] = None
     inner_ring_size: Optional[int] = None
     # for moe related
     moe_dp_group: Optional[ProcessGroup] = None
diff --git a/tests/test_shardformer/test_layer/test_ring_attn.py b/tests/test_shardformer/test_layer/test_ring_attn.py
index 0ffea2016573..48f25dea3960 100644
--- a/tests/test_shardformer/test_layer/test_ring_attn.py
+++ b/tests/test_shardformer/test_layer/test_ring_attn.py
@@ -5,6 +5,7 @@
 from torch.testing import assert_close
 
 import colossalai
+from colossalai.cluster import ProcessGroupMesh
 from colossalai.shardformer.layer import AttnMaskType
 from colossalai.shardformer.layer.attn import AttnMaskType, RingAttention
 from colossalai.shardformer.layer.utils import split_batch_zigzag, split_varlen_zigzag
@@ -21,6 +22,9 @@ def check_ring_attn(seq_len, bs, nheads, d, dtype, inner_ring_size):
     torch.cuda.manual_seed(2)
     device = get_current_device()
     sp_group = dist.group.WORLD
+    dp_size, pp_size, tp_size = 1, 1, 1
+    sp_size = dist.get_world_size()
+    pg_mesh = ProcessGroupMesh(dp_size, pp_size, sp_size, tp_size)
     # Some outliers may seem large, but our errors are still lower than
     # than Megatron-LM context parallel's
     # (https://github.com/NVIDIA/TransformerEngine/blob/33a3d02f81c56e6f7b542c09bfa86657078d57fb/tests/pytorch/fused_attn/run_fused_attn_with_cp.py#L215)
@@ -36,13 +40,7 @@ def check_ring_attn(seq_len, bs, nheads, d, dtype, inner_ring_size):
 
     # Ring attention vs single GPU
     ring_out, ring_lse = RingAttention.attention(
-        q,
-        k,
-        v,
-        sp_group,
-        AttnMaskType.CAUSAL,
-        return_softmax=True,
-        inner_ring_size=inner_ring_size,
+        q, k, v, sp_group, AttnMaskType.CAUSAL, return_softmax=True, inner_ring_size=inner_ring_size, pg_mesh=pg_mesh
     )
     ring_out = ring_out.transpose(1, 2)
     out, lse, _ = flash_attn_qkvpacked_func(
@@ -125,6 +123,7 @@ def check_packed_seq(seqlen, bs, nheads, d, dtype):
         **mask_info,
         pad_output=False,
         return_softmax=True,
+        pg_mesh=dist.group.WORLD,
         # deterministic=True
     )
     ring_out = ring_out.transpose(1, 2).reshape(-1, nheads, d)

From 6be9862aafb4230ead0bcb95c65b5568043e2c13 Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Tue, 15 Oct 2024 11:56:49 +0800
Subject: [PATCH 131/167] fix

---
 .../booster/plugin/hybrid_parallel_plugin.py       |  1 +
 colossalai/shardformer/layer/attn.py               | 14 +++++++++-----
 colossalai/shardformer/modeling/gpt2.py            |  1 +
 colossalai/shardformer/modeling/llama.py           |  1 +
 colossalai/shardformer/shard/shard_config.py       |  1 +
 .../test_shardformer/test_layer/test_ring_attn.py  | 11 ++++++++++-
 6 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index d15a9f397703..de0cd242fb6d 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1181,6 +1181,7 @@ def __init__(
             fp8_communication=fp8_communication,
             inner_ring_size=inner_ring_size,
             pg_mesh=self.pg_mesh,
+            sp_axis=self.sp_axis,
         )
 
         self.amp_config = dict(
diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 6a972f075a6f..1d8a89ce0de2 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -431,7 +431,7 @@ class RingAttention(torch.autograd.Function):
     INTER_RING_GROUP_COPY: dist.ProcessGroup = None
 
     @staticmethod
-    def get_double_ring_groups(sp_group, pg_mesh, inner_ring_size=None):
+    def get_double_ring_groups(sp_axis, pg_mesh, inner_ring_size=None):
         """
         Get 2D ring groups for the given process group. Generally, to avoid congestion, the inner ring size
         shouldn't be larger than the number of NICs on each node.
@@ -441,6 +441,9 @@ def get_double_ring_groups(sp_group, pg_mesh, inner_ring_size=None):
         Returns:
             Tuple[dist.ProcessGroup, dist.ProcessGroup]: Inner-ring process group and inter-ring process group.
         """
+        assert pg_mesh is not None, f"Error: The pg mesh is None! please check the process group initialization."
+
+        sp_group = pg_mesh.get_group_along_axis(sp_axis)
         sp_size = dist.get_world_size(sp_group)
         sp_rank = dist.get_rank(sp_group)
 
@@ -496,6 +499,7 @@ def attention(
         return_softmax=False,
         inner_ring_size=None,
         pg_mesh=None,
+        sp_axis=None,
         **kwargs,
     ):
         """
@@ -506,7 +510,7 @@ def attention(
             q (torch.Tensor): Query tensor. Shape should be [B, nHeads, Sq, D]
             k (torch.Tensor): Key tensor. Shape should be [B, nHeads, Sq, Sq, D]
             v (torch.Tensor): Value tensor. Shape should be [B, nHeads, Sq, Sq, D]
-            sp_group (Optional[dist.ProcessGroup]): Process group for sequence parallelism
+            sp_axis (Optional[int]): Sp axis for the global pg mesh.
             sp_tream (torch.cuda.Stream): An different stream for output correction.
             cu_seqlens (Optional[torch.Tensor], optional): The cumulative sequence lengths
                 of the sequences in the batch, used to index into q.
@@ -539,13 +543,13 @@ def attention(
             attention_mask_type in RingAttention.SUPPORTED_MASK_TYPES
         ), f"Mask type {attention_mask_type} is not supported yet."
 
+        assert pg_mesh is not None, f"Error: The pg mesh is None! please check the process group initialization."
+
         clone_pg = lambda pg: dist.new_group(dist.get_process_group_ranks(pg))
 
         if inner_ring_size != None:
             RingAttention.SP_GROUP = sp_group
-            inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(
-                sp_group, pg_mesh, inner_ring_size
-            )
+            inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(sp_axis, pg_mesh, inner_ring_size)
             RingAttention.INNER_RING_GROUP = inner_ring_group
             RingAttention.INTER_RING_GROUP = inter_ring_group
         else:
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index d75e9b14ae52..1eaed167ccf7 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -869,6 +869,7 @@ def forward(
                 scale=scale,
                 inner_ring_size=shard_config.inner_ring_size,
                 pg_mesh=shard_config.pg_mesh,
+                sp_axis=shard_config.sp_axis,
             )
         else:
             attn_output = ColoAttention.attention(query, key, value, **attention_mask, dropout_p=dropout_p, scale=scale)
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index ff7390643379..ca0751f0474a 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -572,6 +572,7 @@ def forward(
                 **attention_mask,
                 inner_ring_size=shard_config.inner_ring_size,
                 pg_mesh=shard_config.pg_mesh,
+                sp_axis=shard_config.sp_axis,
             )
 
         elif shard_config.enable_flash_attention:
diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py
index 14963f7a5f9d..7b31e6928ae2 100644
--- a/colossalai/shardformer/shard/shard_config.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -51,6 +51,7 @@ class ShardConfig:
     extra_kwargs: Dict[str, Any] = field(default_factory=dict)
 
     # For ring attention
+    sp_axis: Optional[int] = None
     pg_mesh: Optional[int] = None
     inner_ring_size: Optional[int] = None
     # for moe related
diff --git a/tests/test_shardformer/test_layer/test_ring_attn.py b/tests/test_shardformer/test_layer/test_ring_attn.py
index 48f25dea3960..23e8e5b78c1d 100644
--- a/tests/test_shardformer/test_layer/test_ring_attn.py
+++ b/tests/test_shardformer/test_layer/test_ring_attn.py
@@ -24,6 +24,7 @@ def check_ring_attn(seq_len, bs, nheads, d, dtype, inner_ring_size):
     sp_group = dist.group.WORLD
     dp_size, pp_size, tp_size = 1, 1, 1
     sp_size = dist.get_world_size()
+    sp_axis = 2
     pg_mesh = ProcessGroupMesh(dp_size, pp_size, sp_size, tp_size)
     # Some outliers may seem large, but our errors are still lower than
     # than Megatron-LM context parallel's
@@ -40,7 +41,15 @@ def check_ring_attn(seq_len, bs, nheads, d, dtype, inner_ring_size):
 
     # Ring attention vs single GPU
     ring_out, ring_lse = RingAttention.attention(
-        q, k, v, sp_group, AttnMaskType.CAUSAL, return_softmax=True, inner_ring_size=inner_ring_size, pg_mesh=pg_mesh
+        q,
+        k,
+        v,
+        sp_group,
+        AttnMaskType.CAUSAL,
+        return_softmax=True,
+        inner_ring_size=inner_ring_size,
+        pg_mesh=pg_mesh,
+        sp_axis=sp_axis,
     )
     ring_out = ring_out.transpose(1, 2)
     out, lse, _ = flash_attn_qkvpacked_func(

From fd92789af27d3d7269529508c316db62040568aa Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Tue, 15 Oct 2024 13:26:44 +0800
Subject: [PATCH 132/167] fix

---
 colossalai/shardformer/layer/attn.py                | 5 ++---
 colossalai/shardformer/modeling/gpt2.py             | 5 ++---
 colossalai/shardformer/modeling/llama.py            | 3 +--
 tests/test_shardformer/test_layer/test_ring_attn.py | 8 ++++----
 4 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 1d8a89ce0de2..1a175f426c8f 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -488,7 +488,7 @@ def attention(
         q,  # (B, H, Sq, D)
         k,
         v,
-        sp_group,
+        sp_axis,
         attention_mask_type,
         cu_seqlens=None,
         max_seqlen=None,
@@ -499,7 +499,6 @@ def attention(
         return_softmax=False,
         inner_ring_size=None,
         pg_mesh=None,
-        sp_axis=None,
         **kwargs,
     ):
         """
@@ -546,7 +545,7 @@ def attention(
         assert pg_mesh is not None, f"Error: The pg mesh is None! please check the process group initialization."
 
         clone_pg = lambda pg: dist.new_group(dist.get_process_group_ranks(pg))
-
+        sp_group = pg_mesh.get_group_along_axis(sp_axis)
         if inner_ring_size != None:
             RingAttention.SP_GROUP = sp_group
             inner_ring_group, inter_ring_group = RingAttention.get_double_ring_groups(sp_axis, pg_mesh, inner_ring_size)
diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 1eaed167ccf7..90ffba8cf061 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -857,19 +857,18 @@ def forward(
         dropout_p = self.attn_dropout.p if self.training else 0.0
 
         sp_mode = shard_config.sequence_parallelism_mode
-        sp_group = shard_config.sequence_parallel_process_group
+        shard_config.sequence_parallel_process_group
         if sp_mode == "ring_attn":
             attn_output = RingAttention.attention(
                 query,
                 key,
                 value,
-                sp_group,
+                sp_axis=shard_config.sp_axis,
                 **attention_mask,
                 dropout_p=dropout_p,
                 scale=scale,
                 inner_ring_size=shard_config.inner_ring_size,
                 pg_mesh=shard_config.pg_mesh,
-                sp_axis=shard_config.sp_axis,
             )
         else:
             attn_output = ColoAttention.attention(query, key, value, **attention_mask, dropout_p=dropout_p, scale=scale)
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index ca0751f0474a..2e6363060bb2 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -568,11 +568,10 @@ def forward(
                 query_states,
                 key_states,
                 value_states,
-                sp_group,
+                sp_axis=shard_config.sp_axis,
                 **attention_mask,
                 inner_ring_size=shard_config.inner_ring_size,
                 pg_mesh=shard_config.pg_mesh,
-                sp_axis=shard_config.sp_axis,
             )
 
         elif shard_config.enable_flash_attention:
diff --git a/tests/test_shardformer/test_layer/test_ring_attn.py b/tests/test_shardformer/test_layer/test_ring_attn.py
index 23e8e5b78c1d..6ebd8da73edf 100644
--- a/tests/test_shardformer/test_layer/test_ring_attn.py
+++ b/tests/test_shardformer/test_layer/test_ring_attn.py
@@ -44,12 +44,11 @@ def check_ring_attn(seq_len, bs, nheads, d, dtype, inner_ring_size):
         q,
         k,
         v,
-        sp_group,
+        sp_axis,
         AttnMaskType.CAUSAL,
         return_softmax=True,
         inner_ring_size=inner_ring_size,
         pg_mesh=pg_mesh,
-        sp_axis=sp_axis,
     )
     ring_out = ring_out.transpose(1, 2)
     out, lse, _ = flash_attn_qkvpacked_func(
@@ -88,6 +87,7 @@ def check_packed_seq(seqlen, bs, nheads, d, dtype):
     device = get_current_device()
     sp_group = dist.group.WORLD
     sp_size = dist.get_world_size()
+    sp_axis = 2
     atol = rtol = 7e-3
     torch.cuda.manual_seed(2)
     # Prepare varlen attention mask
@@ -128,11 +128,11 @@ def check_packed_seq(seqlen, bs, nheads, d, dtype):
         q_ring,
         k_ring,
         v_ring,
-        sp_group,
+        sp_axis,
         **mask_info,
         pad_output=False,
         return_softmax=True,
-        pg_mesh=dist.group.WORLD,
+        pg_mesh=ProcessGroupMesh(1, 1, sp_size, 1),
         # deterministic=True
     )
     ring_out = ring_out.transpose(1, 2).reshape(-1, nheads, d)

From bc7eeade33e33e3a7c2df26fedab707f3a62d6fe Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Tue, 15 Oct 2024 13:28:33 +0800
Subject: [PATCH 133/167] fix

---
 colossalai/shardformer/modeling/gpt2.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/colossalai/shardformer/modeling/gpt2.py b/colossalai/shardformer/modeling/gpt2.py
index 90ffba8cf061..d550484da2eb 100644
--- a/colossalai/shardformer/modeling/gpt2.py
+++ b/colossalai/shardformer/modeling/gpt2.py
@@ -857,7 +857,6 @@ def forward(
         dropout_p = self.attn_dropout.p if self.training else 0.0
 
         sp_mode = shard_config.sequence_parallelism_mode
-        shard_config.sequence_parallel_process_group
         if sp_mode == "ring_attn":
             attn_output = RingAttention.attention(
                 query,

From 9912cc8c07f66e9f5537d469428b1f06f890e29a Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Tue, 15 Oct 2024 06:26:01 +0000
Subject: [PATCH 134/167] [fix] fix bwd b; now bwd w only for Layer replaced by
 Linear1D_Col/Row; other layer perform a fully bwd;

---
 colossalai/pipeline/schedule/zero_bubble_pp.py        |  9 +++------
 colossalai/shardformer/layer/_operation.py            |  1 -
 colossalai/shardformer/layer/linear.py                |  1 -
 .../test_pipeline/test_schedule/test_zerobubble_pp.py | 11 ++++++++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/colossalai/pipeline/schedule/zero_bubble_pp.py b/colossalai/pipeline/schedule/zero_bubble_pp.py
index 089ca48eeb60..e155284bfc1b 100644
--- a/colossalai/pipeline/schedule/zero_bubble_pp.py
+++ b/colossalai/pipeline/schedule/zero_bubble_pp.py
@@ -509,12 +509,11 @@ def backward_b_step(
             optimizer.backward_by_grad(
                 tensor=output_obj_,
                 grad=output_obj_grad_,
-                inputs=input_obj_,
-                retain_graph=True,
+                # inputs=input_obj_,
+                # retain_graph=True,
             )
-
         # Format output_obj_grad
-        input_obj_grad = {}
+        input_obj_grad = dict()
         if model_chunk_id == 0 and self.stage_manager.is_first_stage(ignore_chunk=True):
             pass
         else:
@@ -714,7 +713,6 @@ def schedule_b(
         #     # we save output_tensor_grad here
         #     self.output_tensors_grad_dw[model_chunk_id].append(output_tensor_grad)
 
-        # Step2: bwd step
         input_object_grad = self.backward_b_step(
             model_chunk=model_chunk,
             model_chunk_id=model_chunk_id,
@@ -761,7 +759,6 @@ def schedule_w(
         # get y & dy from buffer
         # output_obj = self.output_tensors_dw[model_chunk_id].pop(0)
         # output_obj_grad = self.output_tensors_grad_dw[model_chunk_id].pop(0)
-
         WeightGradStore.pop(chunk=model_chunk_id)
 
         # self.backward_w_step(
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index 9d3d91034c45..4a0800468ed7 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -177,7 +177,6 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
                 handle = dist.all_reduce(grad_input, group=ctx.process_group, async_op=True)
             # Relay on CUDA_DEVICE_MAX_CONNECTIONS=1 to have
             # all-reduce scheduled first and have GPU resources allocated, CUDA_DEVICE_MAX_CONNECTIONS=1 is set in shardformer.py
-
         if _grad_accum_fusion_available and weight.grad is not None:
             grad = weight.grad
             if use_zbv:
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index cb3ad0b45260..a8a3be63a1a9 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -230,7 +230,6 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
                 fp8_communication=self.fp8_communication,
                 use_zbv=self.use_zbv,
             )
-
         if self.gather_output:
             # All-gather across the partitions.
             output = gather_forward_split_backward(
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index fb59e0b2cc30..6286cc6f062a 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -753,8 +753,10 @@ def run_with_hybridplugin(test_config):
     "config",
     [
         # TODO:ERR in second iter
-        # (0, 1, 4, 1, 1),
-        # (1, 2, 2, 1, 1),
+        (0, 1, 4, 1, 1),
+        (1, 2, 2, 1, 1),
+        (1, 1, 2, 2, 1),
+        # Pass
         (1, 2, 1, 2, 1),
         (1, 2, 1, 1, 2),
     ],
@@ -891,19 +893,22 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
 
         # ===================================================================================
         # run normal model with all dp(different) inputs
-        all_inputs = [torch.empty_like(input_embeddings) for _ in range(dp_size)]
+        all_inputs = [input_embeddings.clone() for _ in range(dp_size)]
         dist.all_gather(all_inputs, input_embeddings, group=plugin.dp_group)
         torch_output_sum = 0
         for input_data_ in all_inputs:
             torch_output = torch_model(inputs_embeds=input_data_.to(dtype)).last_hidden_state.mean()
             torch_output.backward()
             torch_output_sum += torch_output.detach()
+        # print(f"parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
         # avg dp grads follows zero optimizer
         for p in torch_model.parameters():
             if p.grad is not None:
                 p.grad /= dp_size
         torch_optimizer.step()
         torch_optimizer.zero_grad()
+
+        # print(f"rank {dist.get_rank()} parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
         assert_loose_close(parallel_output, torch_output_sum, dtype=dtype)
         print(f"rank {dist.get_rank()} config {test_config}  test passed")
     clear_layout_converter()

From 83cf2f84fb0c08a351a5affc71527556ff8912bc Mon Sep 17 00:00:00 2001
From: wangbluo <2538539015@qq.com>
Date: Tue, 15 Oct 2024 14:50:27 +0800
Subject: [PATCH 135/167] fix

---
 colossalai/shardformer/layer/attn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index 1a175f426c8f..bbd99d1620ce 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -470,14 +470,14 @@ def get_double_ring_groups(sp_axis, pg_mesh, inner_ring_size=None):
         # Create inner ring groups
         for i in range(inner_ring_size):
             ranks = list(range(i * inner_ring_size, (i + 1) * inner_ring_size))
-            group = pg_mesh.get_group_along_axis(2, ranks)
+            group = pg_mesh.get_group_along_axis(sp_axis, ranks)
             if sp_rank in ranks:
                 inner_ring_group = group
 
         # Create inter ring groups
         for i in range(num_rings):
             ranks = list(range(i, sp_size, num_rings))
-            group = pg_mesh.get_group_along_axis(2, ranks)
+            group = pg_mesh.get_group_along_axis(sp_axis, ranks)
             if sp_rank in ranks:
                 inter_ring_group = group
 

From 90939b77e040513575687e2368b94eb0bf9516a1 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Tue, 15 Oct 2024 09:39:11 +0000
Subject: [PATCH 136/167] [fix] debug zbv llama test;

---
 .../test_schedule/test_zerobubble_pp.py       |  2 -
 .../test_model/test_shard_llama.py            | 53 ++++++++++---------
 2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index a56a68cd397f..1a1fbbeb2de8 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -756,11 +756,9 @@ def run_with_hybridplugin(test_config):
 @parameterize(
     "config",
     [
-        # TODO:ERR in second iter
         (0, 1, 4, 1, 1),
         (1, 2, 2, 1, 1),
         (1, 1, 2, 2, 1),
-        # Pass
         (1, 2, 1, 2, 1),
         (1, 2, 1, 1, 2),
     ],
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index 04ef78221d34..ce513f1fdbd4 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -277,32 +277,33 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        {
-            "tp_size": 2,
-            "pp_size": 2,
-            "pp_style": "zbv",
-            "num_model_chunks": 2,
-            "num_microbatches": 4,
-            "enable_all_optimization": False,
-            "precision": "fp16",
-            "zero_stage": 0,
-            "initial_scale": 1,
-            "enable_gradient_checkpointing": True,
-            "parallel_output": False,
-        },
-        {
-            "tp_size": 2,
-            "pp_size": 2,
-            "pp_style": "zbv",
-            "num_model_chunks": 2,
-            "num_microbatches": 4,
-            "enable_all_optimization": False,
-            "precision": "fp16",
-            "zero_stage": 1,
-            "initial_scale": 1,
-            "enable_gradient_checkpointing": True,
-            "parallel_output": False,
-        },
+        # TODO: assert layer error
+        # {
+        #     "tp_size": 2,
+        #     "pp_size": 2,
+        #     "pp_style": "zbv",
+        #     "num_model_chunks": 2,
+        #     "num_microbatches": 4,
+        #     "enable_all_optimization": False,
+        #     "precision": "fp16",
+        #     "zero_stage": 0,
+        #     "initial_scale": 1,
+        #     "enable_gradient_checkpointing": True,
+        #     "parallel_output": False,
+        # },
+        # {
+        #     "tp_size": 2,
+        #     "pp_size": 2,
+        #     "pp_style": "zbv",
+        #     "num_model_chunks": 2,
+        #     "num_microbatches": 4,
+        #     "enable_all_optimization": False,
+        #     "precision": "fp16",
+        #     "zero_stage": 1,
+        #     "initial_scale": 1,
+        #     "enable_gradient_checkpointing": True,
+        #     "parallel_output": False,
+        # },
     ],
 )
 def run_llama_test(test_config):

From 62c13e7969f2bf48661aa5c7d0251ca846c6d462 Mon Sep 17 00:00:00 2001
From: Wenxuan Tan <wenxuan.tan@wisc.edu>
Date: Tue, 15 Oct 2024 22:23:35 -0500
Subject: [PATCH 137/167] [Ring Attention] Improve comments  (#6085)

* improve comments

* improve comments

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
---
 colossalai/shardformer/layer/attn.py  | 99 +++++++++++++++++----------
 colossalai/shardformer/layer/utils.py | 32 +++++----
 2 files changed, 80 insertions(+), 51 deletions(-)

diff --git a/colossalai/shardformer/layer/attn.py b/colossalai/shardformer/layer/attn.py
index bbd99d1620ce..3202ebf25813 100644
--- a/colossalai/shardformer/layer/attn.py
+++ b/colossalai/shardformer/layer/attn.py
@@ -422,13 +422,18 @@ class RingAttention(torch.autograd.Function):
     ATTN_DONE: torch.cuda.Event = None
     SP_STREAM: torch.cuda.Stream = None
     SP_GROUP: dist.ProcessGroup = None
-    # duplicate process group for concurrent NCCL streams
-    # while both PyTorch and NCCL warns(https://github.com/pytorch/pytorch/commit/2dbe5cb979f674f0052a8eea1f7b6c3c0ba441d7)
-    # against this, in practice it seems to work fine.
+
+    # NOTE: Duplicating PGs for concurrent NCCL streams is a risky hack -- while it may increase throughput,
+    # both PyTorch and NCCL warn against this. (https://github.com/pytorch/pytorch/commit/2dbe5cb979f674f0052a8eea1f7b6c3c0ba441d7)
+    # LoongTrain's original double ring impl. uses concurrent PGs
+    # (https://github.com/InternLM/InternEvo/blob/e52f2ffc9acf818e8f2b1f97dfc69ceb2f06e154/internlm/model/ops/ring_flash_attn/zigzag_ring_flash_attn_with_sliding_window.py#L192)
+    # but I confirmed with Pytorch developers this can cause obscure "Software caused connection abort" errors.
+    # (https://github.com/pytorch/pytorch/issues/132852)
+    # NOTE: In general, a smarter idea is put as many P2P calls as possible into one `batch_isend_irecv`.
     INNER_RING_GROUP: dist.ProcessGroup = None
-    INNER_RING_GROUP_COPY: dist.ProcessGroup = None
+    # INNER_RING_GROUP_COPY: dist.ProcessGroup = None
     INTER_RING_GROUP: dist.ProcessGroup = None
-    INTER_RING_GROUP_COPY: dist.ProcessGroup = None
+    # INTER_RING_GROUP_COPY: dist.ProcessGroup = None
 
     @staticmethod
     def get_double_ring_groups(sp_axis, pg_mesh, inner_ring_size=None):
@@ -626,7 +631,13 @@ def forward(
         inner_ring_group: Optional[dist.ProcessGroup] = None,
         inter_ring_group: Optional[dist.ProcessGroup] = None,
     ):
-
+        """
+        Forward supporting both packed (varlen) and batched(fixed length, no padding) sequences.
+        No separate version for batched seq (hard to maintain), which incurs
+        some overhead in sequence splitting due to python for loops.
+        Uses two CUDA streams to overlap softmax denominator correction with next flash attn
+        (see comments below).
+        """
         cu_seqlens_q = cu_seqlens_kv = cu_seqlens
         max_seqlen_q = max_seqlen_kv = max_seqlen
         cu_seqlens_half = cu_seqlens // 2
@@ -668,7 +679,8 @@ def forward(
 
         sp_size = dist.get_world_size(sp_group)
         sp_rank = dist.get_rank(sp_group)
-        # Attempt to achieve concurrent comm in the two-stream forward
+
+        # Create communicators corresponding to two CUDA streams
         local_kv_comms = [RingComm(inner_ring_group) for _ in range(2)]
         inter_ring_comm = RingComm(inter_ring_group)
         local_sp_size = dist.get_world_size(inner_ring_group)
@@ -676,7 +688,7 @@ def forward(
         inter_ring_rank = dist.get_rank(inter_ring_group) if inter_ring_group is not sp_group else 0
         num_rings = dist.get_world_size(inter_ring_group) if inter_ring_group is not sp_group else 1
 
-        # Non-contiguous indexing copies to a new contiguous tensor,
+        # Any type of indexing(but not slicing) copies to a new contiguous tensor,
         # so only do it once
         if sp_rank != sp_size - 1:
             q1 = q[half_idx_back]
@@ -693,6 +705,7 @@ def forward(
         rng_states = [None for _ in range(sp_size)]
         sp_streams = [torch.cuda.current_stream(), sp_stream]
 
+        # Helper to pass args to FA
         def _forward(q, k, v, causal):
             (
                 _,
@@ -723,6 +736,7 @@ def _kv_comm(i):
             if i < local_sp_size - 1:
                 local_kv_comms[i % 2].send_recv(kv_buffers[i % 2], kv_buffers[(i + 1) % 2])
 
+        # Forward within a node
         def _local_ring_forward():
             # (Hopefully) overlap output correction with next flash attn
             for i in range(local_sp_size):
@@ -731,6 +745,8 @@ def _local_ring_forward():
                     # NOTE: waiting outside the current stream will NOT correctly synchronize.
                     if i > 0:
                         local_kv_comms[(i + 1) % 2].wait()
+
+                    # Prefetch
                     if i == 0:
                         _kv_comm(i)
 
@@ -764,15 +780,22 @@ def _local_ring_forward():
                         ) = _forward(q_block, kv_block[0], kv_block[1], causal=False)
                     RingAttention.ATTN_DONE.record()
                     # Pipeline the next KV comm with output correction instead of the next flash attn
-                    # to minimize idle time when comm takes longer than attn.
+                    # kernel, to minimize bubble when comm takes longer than attn.
                     _kv_comm(i + 1)
 
                     block_softmax_lse[i % 2] = (
                         block_softmax_lse[i % 2].transpose(0, 1).unsqueeze(-1).contiguous().float()
                     )  # (H, T) -> (T, H, 1)
                     assert block_out[i % 2].shape[:-1] == block_softmax_lse[i % 2].shape[:-1]
-                    # Output and log sum exp correction. Ideally overlap this with the next flash attn kernel.
-                    # In reality this always finishes before next flash attn; no need for extra sync.
+
+                    # Output and log sum exp correction.
+                    # Ideally overlap this with the next flash attn kernel,
+                    # since attn uses Tensor Core and rescale is element-wise, memory-bound and uses CUDA cores.
+                    # (NOTE that this is the same as ping-pong scheduling idea in FA3)
+                    # TODO However sometimes while the GPU has scheduled the next kernel,
+                    # it's reluctant to launch it in overlap. Some potential causes:
+                    # 1. need lower-level CUDA scheduling 2. further benchmark against Megatron-LM
+                    # 3. register spilling by FA kernel.
                     if i == 0:
                         out = block_out[0]
                         softmax_lse = block_softmax_lse[0]
@@ -788,15 +811,17 @@ def _local_ring_forward():
             torch.cuda.current_stream().wait_stream(sp_stream)
             return out, softmax_lse
 
+        # Forward for inter-node (the outer ring in 2D ring)
         def _other_ring_forward(ring_num_idx, out, softmax_lse):
             # Loop through the inner ring after receiving
-            # all new KVs from the previous inner ring
+            # all new KVs from another ring
             for i in range(local_sp_size):
                 with torch.cuda.stream(sp_streams[i % 2]):
                     # Send & recv KV
                     if i > 0:
                         local_kv_comms[(i + 1) % 2].wait()
 
+                    # Prefetch
                     if i == 0:
                         _kv_comm(i)
 
@@ -893,7 +918,8 @@ def _other_ring_forward(ring_num_idx, out, softmax_lse):
     def backward(ctx, dout, _):
         """
         During backward, we accumulate q grads on each rank locally, but iterate kv and their grads
-        over all ranks for accumulation.
+        over all ranks for accumulation. We avoid using two streams due to backward using doubled
+        buffers and more comm cost.
         """
         (q, k, v, out, softmax_lse, cu_seqlens_q, cu_seqlens_kv, half_idx_front, half_idx_back) = ctx.saved_tensors[:9]
         rng_states = ctx.saved_tensors[9:]
@@ -925,7 +951,7 @@ def backward(ctx, dout, _):
         local_sp_rank = dist.get_rank(sp_group)
         sp_size = dist.get_world_size(sp_group)
 
-        # Using separate streams (pg) for concurrent kv and dkv comm may
+        # NOTE: Using separate streams (PG) for concurrent kv and dkv comm may
         # cause NCCL "software caused connection abort" here...
         local_kv_comm = RingComm(local_kv_group)
         local_dkv_comm = RingComm(local_kv_group)
@@ -957,6 +983,7 @@ def backward(ctx, dout, _):
         dkv_buffers = [torch.empty_like(kv, dtype=torch.float32) for kv in kv_buffers]  # (T, H, D)
         del k, v
 
+        # Helper to pass args to FA
         def _backward(dout, q, k, v, out, softmax_lse, dq, dk, dv, rng_state, causal):
             _flash_attn_backward(
                 dout,
@@ -977,8 +1004,7 @@ def _backward(dout, q, k, v, out, softmax_lse, dq, dk, dv, rng_state, causal):
                 **misc_kwargs,
             )
 
-        # NOTE: We avoid using two streams due to doubled buffers
-        # and that backward is more communication intensive.
+        # Backward within a node
         def _local_ring_backward():
             for i in range(local_sp_size):
                 if i > 0:
@@ -1041,6 +1067,7 @@ def _local_ring_backward():
             dkv_send = dkv_buffers[(local_sp_size - 1) % 2]
             return dq, dkv_recv, dkv_send
 
+        # Backward for inter-node (the outer ring in 2D ring)
         def _other_ring_backward(ring_num_idx, dq):
             if ring_num_idx > inter_ring_rank:
                 # Indexing is expensive
@@ -1125,34 +1152,34 @@ def _other_ring_backward(ring_num_idx, dq):
 
     @staticmethod
     def prepare_varlen_batch(
-        attention_mask: torch.Tensor,
+        padding_mask: torch.Tensor,
         sp_group: dist.ProcessGroup,
         inputs_embeds: torch.Tensor = None,
         position_ids: Optional[torch.Tensor] = None,
         is_label: bool = False,
-        is_2d: bool = True,
+        is_batched_seq: bool = True,
     ):
+        # TODO: support setting a batch dim (fix packing length) for packed mode, so that
+        # DP can be used (needs to modify dataloader too)
         """
         Preprocess a batch of padded sequence by splitting input sequence by sp_size
-        sequence-wise and packing them into one sequence. Updates the mask info accordingly.
+        seq-wise and packing them into one sequence. Updates the mask info accordingly.
         Args:
-            attention_mask (torch.Tensor): Contains the mask [B, Sq], where True means the token is NOT masked.
+            padding_mask (torch.Tensor): Contains the mask [B, Sq], where True means the token is NOT masked.
             sp_group (dist.ProcessGroup): Process group for sequence parallelism
             inputs_embeds (torch.Tensor): Input embeddings. Shape should be [B, Sq, ...]
             position_ids (Optional[torch.Tensor], optional): Position ids of shape [Sq] or [1, Sq]. Defaults to None.
             is_label (bool, optional): Whether inputs_embeds is instead a label tensor. If True, mask out the first
                 token of each sequence.
-            is_2d (bool, optional): Whether to return 2D outputs padded to max_seqlen // sp_size or flatten
-                the batch dim to a packed 1d sequence. Contingent on model forward shape definitions.
+            is_batched_seq (bool, optional): If True, then the input is a batch of (potentially padded) sequences
+                of shape [B, Sq, ...]; else a packed sequence of shape [T, ...].
 
         Returns:
-            torch.Tensor:
-                Packed input embeddings of shape [B, Sq // sp_size, ...].
-
-            Dict[str, Any]:
+            inputs_embeds (torch.Tensor):
+                Packed input embeddings of shape [B, Sq // sp_size, ...] if is_batched_seq, else [T, ...].
+            mask_info (Dict[str, Any]):
                 A dictionary containing mask info.
-
-            torch.Tensor:
+            position_ids (torch.Tensor):
                 Packed position ids of shape [..., Sq // sp_size].
 
         """
@@ -1160,12 +1187,11 @@ def prepare_varlen_batch(
         sp_size = dist.get_world_size(group=sp_group)
         sp_rank = dist.get_rank(group=sp_group)
         mask_info = {}
-        mask_info["max_seqlen"], mask_info["cu_seqlens"] = get_pad_info(attention_mask, return_indices=False)
+        mask_info["max_seqlen"], mask_info["cu_seqlens"] = get_pad_info(padding_mask, return_indices=False)
 
-        # Unpad, split seq-wise, then pad back to (B, max_seqlen // sp_size)
-        # Split mask to compute local nonzero position indices
+        # Unpad, split seq-wise, then pad to (B, max_seqlen // sp_size)
         # (B, Sq) -> (B, max_seqlen // sp_size)
-        attention_mask = attention_mask[:, : mask_info["max_seqlen"]]
+        padding_mask = padding_mask[:, : mask_info["max_seqlen"]]
         if inputs_embeds is not None:
             inputs_embeds = inputs_embeds[:, : mask_info["max_seqlen"]]
             inputs_embeds = split_varlen_zigzag(
@@ -1173,11 +1199,12 @@ def prepare_varlen_batch(
                 mask_info["cu_seqlens"],
                 sp_group,
                 mask_info["max_seqlen"],
-                is_2d=is_2d,
+                is_batched_seq=is_batched_seq,
                 is_label=is_label,
             )
-        attention_mask = split_varlen_zigzag(
-            attention_mask, mask_info["cu_seqlens"], sp_group, mask_info["max_seqlen"], is_2d=is_2d
+        # Split mask to get local nonzero seq positions
+        padding_mask = split_varlen_zigzag(
+            padding_mask, mask_info["cu_seqlens"], sp_group, mask_info["max_seqlen"], is_batched_seq=is_batched_seq
         )
 
         if position_ids is not None:
@@ -1190,7 +1217,7 @@ def prepare_varlen_batch(
             )
 
         mask_info["max_seqlen"] //= sp_size
-        mask_info["valid_indices"] = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+        mask_info["valid_indices"] = torch.nonzero(padding_mask.flatten(), as_tuple=False).flatten()
         mask_info["cu_seqlens"] //= sp_size
         mask_info["attention_mask_type"] = AttnMaskType.PADDED_CAUSAL
         return inputs_embeds, mask_info, position_ids
diff --git a/colossalai/shardformer/layer/utils.py b/colossalai/shardformer/layer/utils.py
index 4512e0c680f3..2df68e18c64d 100644
--- a/colossalai/shardformer/layer/utils.py
+++ b/colossalai/shardformer/layer/utils.py
@@ -295,8 +295,8 @@ def split_batch_zigzag(
     batch: Union[torch.Tensor, List[torch.Tensor]], sp_group: ProcessGroup, seq_dim: int = 1, is_label: bool = False
 ) -> Union[torch.Tensor, List[torch.Tensor]]:
     """
-    Split the input along the sequence dimension for Ring Attention. Naively spliting the attention mask
-    in the causal setting will result in the preceding ranks having much less workload.
+    Split the input sequence batch . Naively spliting the attention mask in the causal setting
+    will result in the preceding ranks having much less workload.
     We split after "folding" the 2D attention mask in half (https://github.com/zhuzilin/ring-flash-attention/issues/2).
     For example, for sp_size = 4 and seq_len = 8, we get | s0, s7 | s1, s6 | s2, s5 | s3, s4 |.
 
@@ -346,40 +346,42 @@ def split_varlen_zigzag(
     cu_seqlens: torch.Tensor,
     sp_group: ProcessGroup,
     max_seqlen: int = 0,
-    is_2d: bool = False,
+    is_batched_seq: bool = False,
     is_label: bool = False,
 ) -> Union[List[torch.Tensor], torch.Tensor]:
-    """Split each sequence in a batch of packed sequences in a zigzag fashion.
-        For each tensor in batch, return packed sequences if is_2d is False;
-        else return a padded batch of sequences.
-
+    """Split a packed seq/batch of padded sequences in a Zigzag fashion.
+        Different from split_batch_zigzag, inputs here have variable sequence lengths.
     Args:
-        batch (List[torch.Tensor]): Packed sequences of shape (B * Sq, ...), or (B, Sq, ...) if is_2d.
+        batch (List[torch.Tensor]): Packed sequences of shape (T, ...), or (B, Sq, ...) if is_batched_seq,
+            where T is the total number of tokens.
         cu_seqlens (torch.Tensor): Cumulative sequence lengths of shape (B + 1) before splitting.
         sp_group (ProcessGroup): The process group for sequence parallelism.
         max_seqlen (int): The maximum sequence length in the batch before splitting.
-        is_2d (bool): If True, then input has batch size and sequence length split into two dimensions.
+        is_batched_seq (bool): If True, then the input is a batch of sequences padded to the same len.
         is_label (bool): If True, mask out the first token in each sequence (<Start of Sentence>).
 
     Returns:
-        batch (List[torch.Tensor]): Packed sequences of shape (B * max_seqlen // sp_size)
-            or (B, max_seqlen // sp_size, ...) if is_2d
+        batch (List[torch.Tensor]): Packed sequences of shape (T, ..)
+            or (B, max_seqlen // sp_size, ...) if is_batched_seq
     """
     sp_size = dist.get_world_size(sp_group)
     sp_rank = dist.get_rank(sp_group)
     if sp_size == 1:
         return batch
 
-    if is_2d:
+    if is_batched_seq:
         assert max_seqlen > 0, "max_seqlen must be provided for 2D input"
 
     if isinstance(batch, torch.Tensor):
         batch = [batch]
+    # seq: (B, Sq, h, n)
+    # seq = seq[:, :rank * (seqlen // sp_size), ...]
+
     for i, packed_seq in enumerate(batch):
         device = packed_seq.device
         dtype = packed_seq.dtype
 
-        if is_2d:
+        if is_batched_seq:
             assert max_seqlen % (sp_size * 2) == 0
             # Recreate a padded tensor with the new max seqlen
             shape = (packed_seq.shape[0], max_seqlen // sp_size, *packed_seq.shape[2:])
@@ -398,7 +400,7 @@ def split_varlen_zigzag(
                 seqlen % (2 * sp_size) == 0
             ), f"batch {i} seq {j}'s length ({seqlen}) must be divisible by 2 * sp_size = {2 * sp_size} for splitting"
 
-            if is_2d:
+            if is_batched_seq:
                 seq = packed_seq[j][:seqlen]
                 if is_label:
                     # Shift one position to the right for next token prediction
@@ -415,7 +417,7 @@ def split_varlen_zigzag(
                 seq = seq.chunk(sp_size * 2)
                 local_seq.extend([seq[sp_rank], seq[2 * sp_size - 1 - sp_rank]])
 
-        if is_2d:
+        if is_batched_seq:
             batch[i] = local_seq.contiguous()
         else:
             batch[i] = torch.cat(local_seq, dim=0)

From e76308c6e65cb73cc5b20936bd232ba7390c6b11 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Wed, 16 Oct 2024 03:25:04 +0000
Subject: [PATCH 138/167] [fix] rm use_zbv flag in Shardconfig; rm debug info;

---
 .../booster/plugin/hybrid_parallel_plugin.py  |   1 -
 .../plugin/moe_hybrid_parallel_plugin.py      |   1 -
 colossalai/shardformer/policies/llama.py      |  24 +-
 colossalai/shardformer/policies/mixtral.py    |  28 +-
 colossalai/shardformer/shard/shard_config.py  |   1 -
 examples/language/llama/benchmark.py          |   2 -
 .../test_schedule/test_zerobubble_pp.py       | 176 ++++-
 tests/test_pipeline/test_schedule/zbv_poc.py  | 628 ------------------
 .../test_model/test_shard_llama.py            |   2 +-
 9 files changed, 212 insertions(+), 651 deletions(-)
 delete mode 100644 tests/test_pipeline/test_schedule/zbv_poc.py

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index bba943f12810..caeed5457c44 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -1201,7 +1201,6 @@ def __init__(
             gradient_checkpoint_config=gradient_checkpoint_config,
             fp8_communication=fp8_communication,
             inner_ring_size=inner_ring_size,
-            use_zbv=(pp_style == "zbv"),
         )
         self.amp_config = dict(
             initial_scale=initial_scale,
diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
index b7e65c6a2f78..8b62a1e2bd8c 100644
--- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py
@@ -373,7 +373,6 @@ def __init__(
             make_vocab_size_divisible_by=make_vocab_size_divisible_by,
             gradient_checkpoint_config=gradient_checkpoint_config,
             fp8_communication=fp8_communication,
-            use_zbv=(pp_style == "zbv"),
         )
         self.amp_config = dict(
             initial_scale=initial_scale,
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 5c68d0c5ebca..db4515d7ea65 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -60,6 +60,11 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         else:
             norm_cls = RMSNorm
 
+        if self.pipeline_stage_manager:
+            use_zbv = self.pipeline_stage_manager.use_zbv
+        else:
+            use_zbv = False
+
         sp_mode = self.shard_config.sequence_parallelism_mode or None
         sp_size = self.shard_config.sequence_parallel_size or None
         sp_group = self.shard_config.sequence_parallel_process_group or None
@@ -129,7 +134,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         kwargs=dict(
                             seq_parallel_mode=sp_mode,
                             fp8_communication=self.shard_config.fp8_communication,
-                            use_zbv=self.shard_config.use_zbv,
+                            use_zbv=use_zbv,
                         ),
                     ),
                     SubModuleReplacementDescription(
@@ -138,7 +143,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         kwargs=dict(
                             seq_parallel_mode=sp_mode,
                             fp8_communication=self.shard_config.fp8_communication,
-                            use_zbv=self.shard_config.use_zbv,
+                            use_zbv=use_zbv,
                         ),
                     ),
                     SubModuleReplacementDescription(
@@ -147,7 +152,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         kwargs=dict(
                             seq_parallel_mode=sp_mode,
                             fp8_communication=self.shard_config.fp8_communication,
-                            use_zbv=self.shard_config.use_zbv,
+                            use_zbv=use_zbv,
                         ),
                     ),
                     SubModuleReplacementDescription(
@@ -156,7 +161,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         kwargs=dict(
                             seq_parallel_mode=sp_mode,
                             fp8_communication=self.shard_config.fp8_communication,
-                            use_zbv=self.shard_config.use_zbv,
+                            use_zbv=use_zbv,
                         ),
                     ),
                     SubModuleReplacementDescription(
@@ -165,7 +170,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         kwargs=dict(
                             seq_parallel_mode=sp_mode,
                             fp8_communication=self.shard_config.fp8_communication,
-                            use_zbv=self.shard_config.use_zbv,
+                            use_zbv=use_zbv,
                         ),
                     ),
                     SubModuleReplacementDescription(
@@ -174,7 +179,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         kwargs=dict(
                             seq_parallel_mode=sp_mode,
                             fp8_communication=self.shard_config.fp8_communication,
-                            use_zbv=self.shard_config.use_zbv,
+                            use_zbv=use_zbv,
                         ),
                     ),
                     SubModuleReplacementDescription(
@@ -183,7 +188,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         kwargs=dict(
                             seq_parallel_mode=sp_mode,
                             fp8_communication=self.shard_config.fp8_communication,
-                            use_zbv=self.shard_config.use_zbv,
+                            use_zbv=use_zbv,
                         ),
                     ),
                 ],
@@ -413,6 +418,10 @@ def module_policy(self):
         from transformers import LlamaForSequenceClassification
 
         policy = super().module_policy()
+        if self.pipeline_stage_manager:
+            use_zbv = self.pipeline_stage_manager.use_zbv
+        else:
+            use_zbv = False
 
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for sequence classification
@@ -425,6 +434,7 @@ def module_policy(self):
                             kwargs=dict(
                                 gather_output=True,
                                 fp8_communication=self.shard_config.fp8_communication,
+                                use_zbv=use_zbv,
                             ),
                         )
                     ]
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index de546b3c5119..11291169a442 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -52,6 +52,10 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         sp_group = self.shard_config.sequence_parallel_process_group or None
         sp_partial_derived = sp_mode in ["split_gather", "ring"]
         tp_size = self.shard_config.tensor_parallel_size
+        if self.pipeline_stage_manager:
+            use_zbv = self.pipeline_stage_manager.use_zbv
+        else:
+            use_zbv = False
 
         # modified for both SP and TP
         num_q_heads = self.model.config.num_attention_heads
@@ -126,7 +130,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         target_module=Linear1D_Col,
                         kwargs={
                             "fp8_communication": self.shard_config.fp8_communication,
-                            "use_zbv": self.shard_config.use_zbv,
+                            "use_zbv": use_zbv,
                         },
                     ),
                     SubModuleReplacementDescription(
@@ -134,7 +138,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         target_module=Linear1D_Col,
                         kwargs={
                             "fp8_communication": self.shard_config.fp8_communication,
-                            "use_zbv": self.shard_config.use_zbv,
+                            "use_zbv": use_zbv,
                         },
                     ),
                     SubModuleReplacementDescription(
@@ -142,7 +146,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         target_module=Linear1D_Col,
                         kwargs={
                             "fp8_communication": self.shard_config.fp8_communication,
-                            "use_zbv": self.shard_config.use_zbv,
+                            "use_zbv": use_zbv,
                         },
                     ),
                     SubModuleReplacementDescription(
@@ -150,7 +154,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         target_module=Linear1D_Row,
                         kwargs={
                             "fp8_communication": self.shard_config.fp8_communication,
-                            "use_zbv": self.shard_config.use_zbv,
+                            "use_zbv": use_zbv,
                         },
                     ),
                     SubModuleReplacementDescription(
@@ -159,7 +163,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                         kwargs={
                             "gather_output": True,
                             "fp8_communication": self.shard_config.fp8_communication,
-                            "use_zbv": self.shard_config.use_zbv,
+                            "use_zbv": use_zbv,
                         },
                     ),
                 ],
@@ -195,7 +199,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                             "tp_group": self.shard_config.tensor_parallel_process_group,
                             "moe_dp_group": self.shard_config.moe_dp_group,
                             "fp8_communication": self.shard_config.fp8_communication,
-                            "use_zbv": self.shard_config.use_zbv,
+                            "use_zbv": use_zbv,
                         },
                     )
                 ],
@@ -330,6 +334,10 @@ def get_shared_params(self) -> List[Dict[int, Tensor]]:
 class MixtralForCausalLMPolicy(MixtralPolicy):
     def module_policy(self):
         policy = super().module_policy()
+        if self.pipeline_stage_manager:
+            use_zbv = self.pipeline_stage_manager.use_zbv
+        else:
+            use_zbv = False
         # TODO: assign pg mesh from plugin to all modules
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for causal lm
@@ -342,7 +350,7 @@ def module_policy(self):
                             kwargs=dict(
                                 gather_output=True,
                                 fp8_communication=self.shard_config.fp8_communication,
-                                use_zbv=self.shard_config.use_zbv,
+                                use_zbv=use_zbv,
                             ),
                         )
                     ],
@@ -392,6 +400,10 @@ def module_policy(self):
         from transformers import MixtralForSequenceClassification
 
         policy = super().module_policy()
+        if self.pipeline_stage_manager:
+            use_zbv = self.pipeline_stage_manager.use_zbv
+        else:
+            use_zbv = False
 
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for sequence classification
@@ -404,7 +416,7 @@ def module_policy(self):
                             kwargs=dict(
                                 gather_output=True,
                                 fp8_communication=self.shard_config.fp8_communication,
-                                use_zbv=self.shard_config.use_zbv,
+                                use_zbv=use_zbv,
                             ),
                         )
                     ]
diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py
index 33e93fa515b7..1219119bb095 100644
--- a/colossalai/shardformer/shard/shard_config.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -49,7 +49,6 @@ class ShardConfig:
     make_vocab_size_divisible_by: int = 64
     gradient_checkpoint_config: Optional[GradientCheckpointConfig] = None
     extra_kwargs: Dict[str, Any] = field(default_factory=dict)
-    use_zbv: bool = False
 
     # For ring attention
     inner_ring_size: Optional[int] = None
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 4f2c45d75ba8..041c51fb19fb 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -5,8 +5,6 @@
 from contextlib import nullcontext
 
 import torch
-
-torch.autograd.set_detect_anomaly(True)
 import torch.distributed as dist
 from data_utils import RandomDataset
 from model_utils import format_numel_str, get_model_numel
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index 1a1fbbeb2de8..bdc539043944 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -8,12 +8,14 @@
 import torch.distributed as dist
 import torch.nn as nn
 from torch.testing import assert_close
+from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaModel
 from transformers.models.mixtral.configuration_mixtral import MixtralConfig
 from transformers.models.mixtral.modeling_mixtral import MixtralModel
 
 import colossalai
 from colossalai.booster.booster import Booster
-from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
+from colossalai.booster.plugin.moe_hybrid_parallel_plugin import HybridParallelPlugin, MoeHybridParallelPlugin
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.interface import OptimizerWrapper
 from colossalai.logging import disable_existing_loggers
@@ -918,11 +920,181 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
     torch.cuda.empty_cache()
 
 
+@parameterize(
+    "config",
+    [
+        (0, 4, 1, 1),
+        # (1, 2, 2, 1),
+        # (1, 2, 1, 2),
+        # (1, 1, 2, 2),
+    ],
+)
+def run_with_booster_hybridplugin(config: Tuple[int, ...]):
+    stage, pp_size, tp_size, sp_size = config
+    num_microbatches = pp_size
+    dist.get_world_size()
+    rank = dist.get_rank()
+    dtype, precision = torch.float16, "fp16"
+    torch.cuda.set_device(dist.get_rank())
+
+    ########
+    # init base model
+    ########
+    assert pp_size <= NUM_LAYERS, "pp_size should be less than or equal to NUM_LAYERS"
+    config = LlamaConfig(
+        hidden_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS,
+        intermediate_size=HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2,
+        num_hidden_layers=NUM_LAYERS,
+        num_attention_heads=NUM_HEADS,
+        num_key_value_heads=NUM_HEADS,
+        attn_implementation="flash_attention_2",
+    )
+
+    # init model with the same seed
+    seed_all(10086)
+
+    torch_model = LlamaModel(config).to(dtype).cuda()
+    # TODO: Support MixtralForCausalLM
+    # torch_model = MixtralForCausalLM(config).to(dtype).cuda()
+    torch_optimizer = torch.optim.SGD(torch_model.parameters(), lr=1)
+    # init schedule
+    h, a, s = config.hidden_size, config.num_attention_heads, 1024
+    mem_f = 34 * h + 5 * a * s
+    mem_w = -32 * h
+    mem_b = -mem_w - mem_f
+    graph = PipelineGraph(
+        n_stage=pp_size,
+        n_micro=num_microbatches,
+        f_cost=1,
+        b_cost=1,
+        w_cost=1,
+        c_cost=1,
+        f_mem=mem_f,
+        b_mem=mem_b,
+        w_mem=mem_w,
+    )
+
+    zbv_schedule = graph.get_v_schedule()
+
+    # init MoeHybridPlugin
+    plugin = HybridParallelPlugin(
+        pp_size=pp_size,
+        num_microbatches=pp_size,
+        tp_size=tp_size,
+        sp_size=sp_size,
+        zero_stage=stage,
+        enable_sequence_parallelism=sp_size > 1,
+        sequence_parallelism_mode="all_to_all" if sp_size > 1 else None,
+        overlap_communication=False,
+        initial_scale=1,
+        precision=precision,
+        find_unused_parameters=True,
+        pp_style="zbv",
+        scheduler_nodes=zbv_schedule,
+        num_model_chunks=2,
+    )
+
+    dp_size = plugin.dp_size
+
+    booster = Booster(plugin=plugin)
+
+    ########
+    # init pp model
+    ########
+
+    parallel_model = deepcopy(torch_model)
+    parallel_optimizer = torch.optim.SGD(parallel_model.parameters(), lr=1)
+    parallel_model, parallel_optimizer, _, _, _ = booster.boost(parallel_model, parallel_optimizer)
+    # create different input along dp axis
+    seed_all(1453 + rank)
+
+    torch_model.train()
+    parallel_model.train()
+    for i in range(2):
+        # gen random input
+        # input = torch.rand(
+        #     NUM_BATCH, NUM_TOK_PER_BATCH, NUM_HEADS, HIDDEN_SIZE_PER_HEAD, requires_grad=True
+        # ).cuda()
+        input_ids = torch.randint(0, torch_model.vocab_size, (NUM_BATCH, config.max_position_embeddings)).cuda()
+        attention_mask = torch.ones_like(input_ids).cuda()
+        input_ids.clone().cuda()
+        input_data = {"input_ids": input_ids, "attention_mask": attention_mask}
+
+        # dist.all_reduce(
+        #     input, group=plugin.pp_group
+        # )  # pp inputs except the first stage doesn't matter, but need to be replicate for torch model check
+
+        # dist.all_reduce(input, group=plugin.tp_group)  # tp group duplicate input
+        # dist.all_reduce(input, group=plugin.sp_group)  # sp group duplicate input
+
+        # run the model with hybrid parallel
+        if booster.plugin.stage_manager is not None:
+            # for test with pp
+            data_iter = iter([input_data])
+            sharded_output = booster.execute_pipeline(
+                data_iter,
+                parallel_model,
+                lambda x, y: x.last_hidden_state.mean(),
+                parallel_optimizer,
+                return_loss=True,
+                return_outputs=True,
+            )
+            # stage 0 chunk 0
+            parallel_output = None
+            if (
+                booster.plugin.stage_manager.is_first_stage(ignore_chunk=True)
+                and rank == dist.get_process_group_ranks(plugin.pp_group)[0]
+            ):
+                parallel_output = sharded_output["loss"]
+            else:
+                parallel_output = torch.tensor(12345.0, device="cuda")
+            # broadcast along pp axis
+            dist.broadcast(parallel_output, src=dist.get_process_group_ranks(plugin.pp_group)[0], group=plugin.pp_group)
+
+        else:
+            # for test without pp
+            parallel_output = parallel_model(
+                input_ids=input_data["input_ids"],
+                attention_mask=input_data["attention_mask"],
+            ).last_hidden_state.mean()
+            parallel_optimizer.backward(parallel_output)
+        parallel_optimizer.step()
+        parallel_optimizer.zero_grad()
+        dist.all_reduce(parallel_output, group=plugin.dp_group)
+
+        # ===================================================================================
+        # run normal model with all dp(different) inputs
+        all_inputs = [input_data for _ in range(dp_size)]
+        # dist.all_gather(all_inputs, input, group=plugin.dp_group)
+        torch_output_sum = 0
+        for input_data_ in all_inputs:
+            torch_output = torch_model(
+                input_ids=input_data_["input_ids"],
+                attention_mask=input_data_["attention_mask"],
+            ).last_hidden_state.mean()
+            torch_output.backward()
+            torch_output_sum += torch_output.detach()
+        # print(f"parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
+        # avg dp grads follows zero optimizer
+        for p in torch_model.parameters():
+            if p.grad is not None:
+                p.grad /= dp_size
+        torch_optimizer.step()
+        torch_optimizer.zero_grad()
+
+        print(f"loop {i} rank {dist.get_rank()} parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
+        # assert_loose_close(parallel_output, torch_output_sum, dtype=dtype)
+        # print(f"rank {dist.get_rank()} config {test_config}  test passed")
+    clear_layout_converter()
+    Randomizer.reset_index()
+    torch.cuda.empty_cache()
+
+
 def run_dist(rank, world_size, port):
     disable_existing_loggers()
     colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
-    # run_fwd_bwd_vschedule_with_optim()
     run_with_booster_moehybridplugin()
+    # run_with_booster_hybridplugin()
 
 
 @pytest.mark.dist
diff --git a/tests/test_pipeline/test_schedule/zbv_poc.py b/tests/test_pipeline/test_schedule/zbv_poc.py
deleted file mode 100644
index 6280990a9edd..000000000000
--- a/tests/test_pipeline/test_schedule/zbv_poc.py
+++ /dev/null
@@ -1,628 +0,0 @@
-import gc
-import time
-from copy import deepcopy
-
-import torch
-import torch.nn as nn
-from torch.testing import assert_close
-
-
-def get_model_numel(model):
-    return sum(p.numel() for p in model.parameters()) / 1024**2
-
-
-# Step1: dx = w*dy
-def backward_b(loss, x, model):
-    torch.autograd.backward(loss, inputs=x, retain_graph=True)
-
-
-# Step2: dummy dw = x*dy
-def backward_w(loss, model):
-    torch.autograd.backward(loss, inputs=list(model.parameters()))
-
-
-def test_double_dx_dw_split_nsync():
-    device = "cuda:0"
-    model = nn.Linear(4096, 4096, bias=None).to(device=device)
-    # print(f"model numel {get_model_numel(model)}") # 4GB
-    x1 = torch.rand(4096, 4096).to(device=device)
-    x2 = torch.rand(4096, 4096).to(device=device)
-    ref_model = deepcopy(model)
-    ref_x1 = x1.clone()
-    ref_x2 = x1.clone()
-
-    # first step
-    x1.requires_grad_()
-    x2.requires_grad_()
-    ref_x1.requires_grad_()
-    ref_x2.requires_grad_()
-
-    # loss for dx_dw bwd
-    loss1 = model(x1).sum()
-    loss2 = model(x2).sum()
-
-    # loss for common bwd
-    ref_loss1 = ref_model(ref_x1).sum()
-    ref_loss2 = ref_model(ref_x2).sum()
-
-    # dx1
-    torch.cuda.synchronize()
-    bwd_b_start_time = time.time()
-    backward_b(loss1, x1, model)
-    bwd_b_end_time = time.time()
-    print(f"loss_1 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
-
-    for p in model.parameters():
-        assert p.grad is None
-    assert x1.grad is not None
-
-    # dx2
-    torch.cuda.synchronize()
-    bwd_b_start_time = time.time()
-    backward_b(loss2, x2, model)
-    bwd_b_end_time = time.time()
-    print(f"loss_2 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
-
-    # dw1
-    torch.cuda.synchronize()
-    bwd_w_start_time = time.time()
-    backward_w(loss1, model)
-    bwd_w_end_time = time.time()
-    print(f"loss_1 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
-    for p in model.parameters():
-        assert p.grad is not None
-
-    # common bwd 1
-    torch.cuda.synchronize()
-    comm_bwd_start_time = time.time()
-    ref_loss1.backward()
-    comm_bwd_end_time = time.time()
-    print(f"loss_1 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
-
-    # # assert dx1 & dw1 == bwd 1
-    # assert_close(x1.grad, ref_x1.grad)
-    # for p1, p2 in zip(model.parameters(), ref_model.parameters()):
-    #     assert_close(p1, p2)
-    #     assert_close(p1.grad, p2.grad)
-
-    # dw2
-    torch.cuda.synchronize()
-    bwd_w_start_time = time.time()
-    backward_w(loss2, model)
-    bwd_w_end_time = time.time()
-    print(f"loss_2 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
-
-    # common bwd 2
-    torch.cuda.synchronize()
-    comm_bwd_start_time = time.time()
-    ref_loss2.backward()
-    comm_bwd_end_time = time.time()
-    print(f"loss_2 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
-
-    # # assert dx2 & dw2 == bwd 2
-    # assert_close(x2.grad, ref_x2.grad)
-    # for p1, p2 in zip(model.parameters(), ref_model.parameters()):
-    #     print(f"bwd2:\n p1 {p1.grad},\n p2 {p2.grad}\n")
-    #     assert_close(p1, p2)
-    #     assert_close(p1.grad, p2.grad)
-
-
-def test_double_dx_dw_split_sync():
-    device = "cuda:0"
-    model = nn.Linear(8, 8, bias=None).to(device=device)
-    print(f"model size {get_model_numel(model)} ")  # 4GB
-    x1 = torch.rand(8, 8).to(device=device)
-    x2 = torch.rand(8, 8).to(device=device)
-
-    # x1 = torch.ones(8, 8).to(device=device)
-    # x2 = torch.ones(8, 8).to(device=device)
-
-    ref_model = deepcopy(model)
-    ref_x1 = x1.clone()
-    ref_x2 = x2.clone()
-
-    x1.requires_grad_()
-    x2.requires_grad_()
-    ref_x1.requires_grad_()
-    ref_x2.requires_grad_()
-
-    ############
-    # step1:
-    ############
-
-    # loss1
-    loss1 = model(x1).sum()
-
-    # ref_loss1
-    ref_model(ref_x1).sum()
-
-    # dx1
-    backward_b(loss1, x1, model)
-    for p in model.parameters():
-        assert p.grad is None
-    assert x1.grad is not None
-
-    # dw1
-    backward_w(loss1, model)
-    for p in model.parameters():
-        assert p.grad is not None
-
-    # common bwd 1
-    # ref_loss1.backward()
-
-    # assert dx1 & dw1 == bwd 1
-    assert_close(x1.grad, ref_x1.grad)
-    for p1, p2 in zip(model.parameters(), ref_model.parameters()):
-        assert_close(p1, p2)
-        assert_close(p1.grad, p2.grad)
-
-    ############
-    # step2:
-    ############
-
-    # loss2
-    loss2 = model(x2).sum()
-
-    # ref_loss2
-    ref_loss2 = ref_model(ref_x2).sum()
-
-    for p1, p2 in zip(model.parameters(), ref_model.parameters()):
-        print(f"bwd2:\n p1 {p1.grad},\n p2 {p2.grad}\n")
-        assert_close(p1, p2)
-        assert_close(p1.grad, p2.grad)
-
-    # dx2
-    backward_b(loss2, x2, model)
-
-    # dw2
-    backward_w(loss2, model)
-
-    # common bwd 2
-    ref_loss2.backward()
-
-    # assert dx2 & dw2 == bwd 2
-    assert_close(x2.grad, ref_x2.grad)
-    for p1, p2 in zip(model.parameters(), ref_model.parameters()):
-        print(f"bwd2:\n p1 {p1.grad},\n p2 {p2.grad}\n")
-        assert_close(p1, p2)
-        assert_close(p1.grad, p2.grad)
-
-
-def deallocate_output_tensor(out):
-    """Pseudo-deallocate (i.e., set to scalar) the output tensor's '.data' field.
-
-    This method should be called right after the output tensor has been
-    sent to the next pipeline stage. At this point, the output tensor is
-    only useful for its '.grad_fn' field, and not its '.data'.
-    """
-    assert isinstance(out, torch.Tensor), "expected Tensor, found %s." % type(out).__name__
-    assert out._base is None, "counter-productive to free a view of another tensor."
-    out.data = torch.empty(
-        (1,),
-        device=out.device,
-        dtype=out.dtype,
-    )
-
-
-IN_DIM = 8192
-OUT_DIM = 8192
-NUM_LAYER = 3
-
-
-class MlpModel(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.layers = nn.ModuleList([nn.Linear(IN_DIM, OUT_DIM, bias=None) for _ in range(NUM_LAYER)])
-
-    def forward(self, x):
-        for layer in self.layers:
-            x = layer(x)
-        return x
-
-
-class Attention(nn.Module):
-    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.0, proj_drop=0.0, with_qkv=True):
-        super().__init__()
-        self.num_heads = num_heads
-        head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim**-0.5
-        self.with_qkv = with_qkv
-        if self.with_qkv:
-            self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
-            self.proj = nn.Linear(dim, dim)
-            self.proj_drop = nn.Dropout(proj_drop)
-        self.attn_drop = nn.Dropout(attn_drop)
-
-    def forward(self, x):
-        B, N, C = x.shape
-        if self.with_qkv:
-            qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
-            q, k, v = qkv[0], qkv[1], qkv[2]
-        else:
-            qkv = x.reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
-            q, k, v = qkv, qkv, qkv
-
-        attn = (q @ k.transpose(-2, -1)) * self.scale
-        attn = attn.softmax(dim=-1)
-        attn = self.attn_drop(attn)
-
-        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
-        if self.with_qkv:
-            x = self.proj(x)
-            x = self.proj_drop(x)
-        return x
-
-
-def mem_dx_dw():
-    device = "cuda:0"
-    # model = nn.Linear(IN_DIM, OUT_DIM, bias=None).to(device=device)
-    print(f"Before init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    model = MlpModel().to(device=device)
-    print(f"model numel {get_model_numel(model)}")  # 4GB
-    print(f"After init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    print(f"Before init x1&2&3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    x1 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-    x2 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-    x3 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-
-    x1.requires_grad_()
-    x2.requires_grad_()
-    x3.requires_grad_()
-    print(f"After init x1&2&3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    ############
-    # step1:
-    ############
-    print(f"\nStep1")
-
-    # loss1
-    print(f"Before Fwd x1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    y1 = model(x1)
-    print(f"After Fwd x1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    print(f"Before loss1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    loss1 = y1.sum()
-    print(f"After loss1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    # dx1
-    backward_b(loss1, x1, model)
-
-    # dw1
-    backward_w(loss1, model)
-
-    deallocate_output_tensor(x1)
-    deallocate_output_tensor(y1)
-    # del x1
-    # del y1
-    print(f"After del x1&y1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    # print(f"\n Step1:collect:{gc.collect()}")
-    # print(f"object: {gc.get_objects()}")
-    # print(f"garbage: {gc.garbage}")
-
-    ############
-    # step2:
-    ############
-    print(f"\nStep2")
-
-    # loss2
-    y2 = model(x2)
-    loss2 = y2.sum()
-
-    # dx2
-    backward_b(loss2, x2, model)
-
-    # dw2
-    backward_w(loss2, model)
-    deallocate_output_tensor(x2)
-    deallocate_output_tensor(y2)
-    # del x2
-    # del y2
-    print(f"After del x2&y2: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    print(f"\n Step2:collect:{gc.collect()}")
-    # print(f"object: {gc.get_objects()}")
-    print(f"garbage: {gc.garbage}")
-
-    ############
-    # step3:
-    ############
-
-    print(f"\nStep3")
-
-    # loss3
-    y3 = model(x3)
-    loss3 = y3.sum()
-
-    # dx2
-    backward_b(loss3, x3, model)
-
-    # dw2
-    backward_w(loss3, model)
-
-    deallocate_output_tensor(x3)
-    deallocate_output_tensor(y3)
-    # del x3
-    # del y3
-
-    print(f"After del x3&y3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    print(f"\n Step3:collect:{gc.collect()}")
-    # print(f"object: {gc.get_objects()}")
-    print(f"garbage: {gc.garbage}")
-
-
-# del activation
-def activation_dx_dw():
-    device = "cuda:0"
-    # model = nn.Linear(IN_DIM, OUT_DIM, bias=None).to(device=device)
-    print(f"Before init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    model = MlpModel().to(device=device)
-    x1 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-    x2 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-    x3 = torch.rand(IN_DIM, OUT_DIM).to(device=device)
-
-    x1.requires_grad_()
-    x2.requires_grad_()
-    x3.requires_grad_()
-    print(f"After init Model, x1,x2,x3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    activations = {}
-
-    def register_hooks(module):
-        def activation_hook(module, input, output):
-            activations[f"{module.__class__.__name__}_{id(module)}"] = output.detach()
-
-        def bwd_hook(module, grad_input, grad_output):
-            del activations[f"{module.__class__.__name__}_{id(module)}"]
-
-        module.register_forward_hook(activation_hook)
-        module.register_backward_hook(bwd_hook)
-
-    model.apply(register_hooks)
-
-    ############
-    # step1:
-    ############
-    print(f"\nStep1")
-
-    # loss1
-    loss1 = model(x1).sum()
-
-    # dx1
-    backward_b(loss1, x1, model)
-
-    # dw1
-    backward_w(loss1, model)
-
-    del loss1, x1
-    print(f"After del x1&y1: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    ############
-    # step2:
-    ############
-    print(f"\nStep2")
-
-    # loss2
-    loss2 = model(x2).sum()
-
-    # dx2
-    backward_b(loss2, x2, model)
-
-    # dw2
-    backward_w(loss2, model)
-
-    # deallocate_output_tensor(x2)
-    # deallocate_output_tensor(loss2)
-    del x2, loss2
-    print(f"After del x2&y2: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    ############
-    # step3:
-    ############
-    print(f"\nStep3")
-
-    # loss3
-    loss3 = model(x3).sum()
-
-    # dx2
-    backward_b(loss3, x3, model)
-
-    # dw2
-    backward_w(loss3, model)
-
-    del x3, loss3
-
-    print(f"After del x3&y3: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-
-# text dx dw in model chunk
-def model_chunk_dx_dw():
-    device = "cuda:0"
-    num_layers = 4
-    print(f"Before init Model: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    model = MlpModel(in_dim=4096, out_dim=4096, num_layers=num_layers).to(device=device)
-    x = torch.rand(4096, 4096).to(device=device)
-    x.requires_grad_()
-
-    model_chunk_0 = torch.nn.ModuleList()  # for layer 1 & 2
-    model_chunk_1 = torch.nn.ModuleList()  # for layer 3 & 4
-
-    for idx, sub_model in enumerate(model.layers):
-        if idx < 2:
-            model_chunk_0.append(sub_model).cuda()
-        else:
-            model_chunk_1.append(sub_model).cuda()
-
-    print(f"After init Model & input: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    # Step1:chunk 0 fwd
-    activation = dict()  # layer_id: activation
-    out = x
-    for i in range(len(model_chunk_0)):
-        layer = model_chunk_0[i]
-        activation[i] = layer(out)
-    print(f"After chunk0 fwd: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-    # Step2:chunk 1 fwd
-    for i in range(len(model_chunk_1)):
-        layer = model_chunk_0[i]
-        activation[i + 2] = layer(out)
-    print(f"After chunk1 fwd: {torch.cuda.memory_allocated()/1024**3 :.3f} GB;")
-
-    # Step3:chunk 1 bwd b: dx=w*dy & bwd w:dw=x*dy
-    # visit layer reversely
-    for i in range(len(model_chunk_1) - 1, -1, -1):
-        layer = model_chunk_1[i]
-        global_layer_idx = i + 2
-        prev_global_layer_idx = i + 1 if i + 1 > 0 else None
-        i + 3 if i + 3 < 4 else None
-
-        # bwd b
-        if global_layer_idx == num_layers - 1:  # last layer in last chunk; calculate loss
-            loss = activation[global_layer_idx].sum()
-            x = activation[prev_global_layer_idx]
-            backward_b(loss, x, layer)
-        else:
-            loss = activation[global_layer_idx].sum()
-            x = activation[prev_global_layer_idx]
-            backward_b(loss, x, layer)
-
-        # bwd w
-        backward_w(loss, layer)
-
-
-def test_dx_dw_linear_benchmark():
-    device = "cuda:0"
-    model = nn.Linear(4096, 4096, bias=None).to(device=device)
-    # print(f"model numel {get_model_numel(model)}") # 4GB
-    x1 = torch.rand(4096, 4096).to(device=device)
-    # x2 = torch.rand(4096, 4096).to(device=device)
-    ref_model = deepcopy(model)
-    ref_x1 = x1.clone()
-    # ref_x2 = x1.clone()
-
-    # first step
-    x1.requires_grad_()
-    # x2.requires_grad_()
-    ref_x1.requires_grad_()
-    # ref_x2.requires_grad_()
-
-    # loss for dx_dw bwd
-    loss1 = model(x1).sum()
-    # loss2 = model(x2).sum()
-
-    # loss for common bwd
-    ref_model(ref_x1).sum()
-    # ref_loss2 = ref_model(ref_x2).sum()
-
-    with torch.profiler.profile(
-        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
-        # schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler(
-            f"/home/nvme-share/home/duanjunwen/ColossalAI/tests/test_pipeline/test_schedule"
-        ),
-        record_shapes=True,
-        profile_memory=True,
-        with_stack=True,
-        with_flops=True,
-    ) as prof:
-        # dx1
-        torch.cuda.synchronize()
-        bwd_b_start_time = time.time()
-        backward_b(loss1, x1, model)
-        bwd_b_end_time = time.time()
-        print(f"loss_1 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
-
-        for p in model.parameters():
-            assert p.grad is None
-        assert x1.grad is not None
-
-        # dw1
-        torch.cuda.synchronize()
-        bwd_w_start_time = time.time()
-        backward_w(loss1, model)
-        bwd_w_end_time = time.time()
-        print(f"loss_1 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
-        for p in model.parameters():
-            assert p.grad is not None
-
-        # # common bwd 1
-        # torch.cuda.synchronize()
-        # comm_bwd_start_time = time.time()
-        # ref_loss1.backward()
-        # comm_bwd_end_time = time.time()
-        # print(f"loss_1 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
-
-
-def test_dx_dw_attn_benchmark():
-    device = "cuda:0"
-    model = Attention(dim=4096).to(device=device)
-    # print(f"model numel {get_model_numel(model)}") # 4GB
-    x1 = torch.rand(1, 256, 4096).to(device=device)
-    # x2 = torch.rand(1, 256, 4096).to(device=device)
-    ref_model = deepcopy(model)
-    ref_x1 = x1.clone()
-    # ref_x2 = x1.clone()
-
-    # first step
-    x1.requires_grad_()
-    # x2.requires_grad_()
-    ref_x1.requires_grad_()
-    # ref_x2.requires_grad_()
-
-    # loss for dx_dw bwd
-    loss1 = model(x1).sum()
-    # loss2 = model(x2).sum()
-
-    # loss for common bwd
-    ref_model(ref_x1).sum()
-    # ref_loss2 = ref_model(ref_x2).sum()
-
-    with torch.profiler.profile(
-        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
-        # schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler(
-            f"/home/nvme-share/home/duanjunwen/ColossalAI/tests/test_pipeline/test_schedule"
-        ),
-        record_shapes=True,
-        profile_memory=True,
-        with_stack=True,
-        with_flops=True,
-    ) as prof:
-        # dx1
-        torch.cuda.synchronize()
-        bwd_b_start_time = time.time()
-        backward_b(loss1, x1, model)
-        bwd_b_end_time = time.time()
-        print(f"loss_1 bwd B runtime {bwd_b_end_time - bwd_b_start_time}")
-
-        for p in model.parameters():
-            assert p.grad is None
-        assert x1.grad is not None
-
-        # dw1
-        torch.cuda.synchronize()
-        bwd_w_start_time = time.time()
-        backward_w(loss1, model)
-        bwd_w_end_time = time.time()
-        print(f"loss_1 bwd W runtime {bwd_w_end_time - bwd_w_start_time}")
-        for p in model.parameters():
-            assert p.grad is not None
-
-        # # common bwd 1
-        # torch.cuda.synchronize()
-        # comm_bwd_start_time = time.time()
-        # ref_loss1.backward()
-        # comm_bwd_end_time = time.time()
-        # print(f"loss_1 comm bwd runtime {comm_bwd_end_time - comm_bwd_start_time}")
-
-
-if __name__ == "__main__":
-    # test_dx_dw_split()
-    # test_double_dx_dw_split_nsync()
-    # test_double_dx_dw_split_sync()
-    # mem_dx_dw()
-    # activation_dx_dw()
-    # test_dx_dw_linear_benchmark()
-    test_dx_dw_attn_benchmark()
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index ce513f1fdbd4..33707a4f6921 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -277,7 +277,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             "precision": "fp16",
             "initial_scale": 1,
         },
-        # TODO: assert layer error
+        # # TODO: assert layer error
         # {
         #     "tp_size": 2,
         #     "pp_size": 2,

From 705b18e1e7b4910ccf1ec24b725300e64bbcaf73 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Wed, 16 Oct 2024 03:58:50 +0000
Subject: [PATCH 139/167] [fix] add & fix  llama test

---
 colossalai/shardformer/modeling/llama.py      |  2 +-
 .../test_schedule/test_zerobubble_pp.py       | 53 ++++++++-----------
 2 files changed, 22 insertions(+), 33 deletions(-)

diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 47c17e7494f2..7a04c5451cfc 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -82,7 +82,7 @@ def llama_model_forward(
             elif input_ids is not None:
                 batch_size, seq_length = input_ids.shape[:2]
             elif inputs_embeds is not None:
-                batch_size, seq_length, _ = inputs_embeds.shape[:2]
+                batch_size, seq_length = inputs_embeds.shape[:2]
             else:
                 raise ValueError("You have to specify either input_ids or inputs_embeds")
             if inputs_embeds is None:
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index bdc539043944..ffeaf6bd8b19 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -924,9 +924,9 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
     "config",
     [
         (0, 4, 1, 1),
-        # (1, 2, 2, 1),
-        # (1, 2, 1, 2),
-        # (1, 1, 2, 2),
+        (1, 2, 2, 1),
+        (1, 2, 1, 2),
+        (1, 1, 2, 2),
     ],
 )
 def run_with_booster_hybridplugin(config: Tuple[int, ...]):
@@ -1010,27 +1010,22 @@ def run_with_booster_hybridplugin(config: Tuple[int, ...]):
 
     torch_model.train()
     parallel_model.train()
-    for i in range(2):
+    for _ in range(2):
         # gen random input
-        # input = torch.rand(
-        #     NUM_BATCH, NUM_TOK_PER_BATCH, NUM_HEADS, HIDDEN_SIZE_PER_HEAD, requires_grad=True
-        # ).cuda()
-        input_ids = torch.randint(0, torch_model.vocab_size, (NUM_BATCH, config.max_position_embeddings)).cuda()
-        attention_mask = torch.ones_like(input_ids).cuda()
-        input_ids.clone().cuda()
-        input_data = {"input_ids": input_ids, "attention_mask": attention_mask}
-
-        # dist.all_reduce(
-        #     input, group=plugin.pp_group
-        # )  # pp inputs except the first stage doesn't matter, but need to be replicate for torch model check
+        input_embeddings = torch.rand(
+            NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS, requires_grad=True
+        ).cuda()
+        dist.all_reduce(
+            input_embeddings, group=plugin.pp_group
+        )  # pp inputs except the first stage doesn't matter, but need to be replicate for torch model check
 
-        # dist.all_reduce(input, group=plugin.tp_group)  # tp group duplicate input
-        # dist.all_reduce(input, group=plugin.sp_group)  # sp group duplicate input
+        dist.all_reduce(input_embeddings, group=plugin.tp_group)  # tp group duplicate input
+        dist.all_reduce(input_embeddings, group=plugin.sp_group)  # sp group duplicate input
 
         # run the model with hybrid parallel
         if booster.plugin.stage_manager is not None:
             # for test with pp
-            data_iter = iter([input_data])
+            data_iter = iter([{"inputs_embeds": input_embeddings}])
             sharded_output = booster.execute_pipeline(
                 data_iter,
                 parallel_model,
@@ -1053,10 +1048,7 @@ def run_with_booster_hybridplugin(config: Tuple[int, ...]):
 
         else:
             # for test without pp
-            parallel_output = parallel_model(
-                input_ids=input_data["input_ids"],
-                attention_mask=input_data["attention_mask"],
-            ).last_hidden_state.mean()
+            parallel_output = parallel_model(inputs_embeds=input_embeddings.to(dtype)).last_hidden_state.mean()
             parallel_optimizer.backward(parallel_output)
         parallel_optimizer.step()
         parallel_optimizer.zero_grad()
@@ -1064,14 +1056,11 @@ def run_with_booster_hybridplugin(config: Tuple[int, ...]):
 
         # ===================================================================================
         # run normal model with all dp(different) inputs
-        all_inputs = [input_data for _ in range(dp_size)]
-        # dist.all_gather(all_inputs, input, group=plugin.dp_group)
+        all_inputs = [input_embeddings.clone() for _ in range(dp_size)]
+        dist.all_gather(all_inputs, input_embeddings, group=plugin.dp_group)
         torch_output_sum = 0
         for input_data_ in all_inputs:
-            torch_output = torch_model(
-                input_ids=input_data_["input_ids"],
-                attention_mask=input_data_["attention_mask"],
-            ).last_hidden_state.mean()
+            torch_output = torch_model(inputs_embeds=input_data_.to(dtype)).last_hidden_state.mean()
             torch_output.backward()
             torch_output_sum += torch_output.detach()
         # print(f"parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
@@ -1082,9 +1071,9 @@ def run_with_booster_hybridplugin(config: Tuple[int, ...]):
         torch_optimizer.step()
         torch_optimizer.zero_grad()
 
-        print(f"loop {i} rank {dist.get_rank()} parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
-        # assert_loose_close(parallel_output, torch_output_sum, dtype=dtype)
-        # print(f"rank {dist.get_rank()} config {test_config}  test passed")
+        # print(f"rank {dist.get_rank()} parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
+        assert_loose_close(parallel_output, torch_output_sum, dtype=dtype)
+        print(f"rank {dist.get_rank()} pp_size:{pp_size}, tp_size {tp_size}, sp_size :{sp_size} test passed")
     clear_layout_converter()
     Randomizer.reset_index()
     torch.cuda.empty_cache()
@@ -1094,7 +1083,7 @@ def run_dist(rank, world_size, port):
     disable_existing_loggers()
     colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_with_booster_moehybridplugin()
-    # run_with_booster_hybridplugin()
+    run_with_booster_hybridplugin()
 
 
 @pytest.mark.dist

From cd61353bae02dd79d6063f7c5f954566d4f54291 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 16 Oct 2024 17:27:33 +0800
Subject: [PATCH 140/167] [pipeline] hotfix backward for multiple outputs
 (#6090)

* [pipeline] hotfix backward for multiple outputs

* [pipeline] hotfix backward for multiple outputs
---
 colossalai/pipeline/schedule/interleaved_pp.py | 17 +++++++++--------
 colossalai/pipeline/schedule/one_f_one_b.py    | 17 +++++++++--------
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/colossalai/pipeline/schedule/interleaved_pp.py b/colossalai/pipeline/schedule/interleaved_pp.py
index c538ee0715b4..5da98364dc85 100644
--- a/colossalai/pipeline/schedule/interleaved_pp.py
+++ b/colossalai/pipeline/schedule/interleaved_pp.py
@@ -351,15 +351,16 @@ def backward_step(
         if output_obj_grad is None:
             optimizer.backward(output_obj)
         else:
-            if "backward_tensor_keys" not in output_obj:
-                for k, grad in output_obj_grad.items():
-                    optimizer.backward_by_grad(output_obj[k], grad)
+            keys = output_obj.get("backward_tensor_keys", output_obj_grad.keys())
+            tensors_to_backward = []
+            grads_to_backward = []
+            for k in keys:
+                tensors_to_backward.append(output_obj[k])
+                grads_to_backward.append(output_obj_grad[k])
+            if len(tensors_to_backward) == 1:
+                optimizer.backward_by_grad(tensors_to_backward[0], grads_to_backward[0])
             else:
-                for k, grad in output_obj_grad.items():
-                    output_obj[k].grad = grad
-                for k in output_obj["backward_tensor_keys"]:
-                    tensor_to_backward = output_obj[k]
-                    optimizer.backward_by_grad(tensor_to_backward, tensor_to_backward.grad)
+                optimizer.backward_by_grad(tensors_to_backward, grads_to_backward)
 
         # Collect the grad of the input_obj.
         input_obj_grad = None
diff --git a/colossalai/pipeline/schedule/one_f_one_b.py b/colossalai/pipeline/schedule/one_f_one_b.py
index 0fc90995adcc..224d63688b16 100644
--- a/colossalai/pipeline/schedule/one_f_one_b.py
+++ b/colossalai/pipeline/schedule/one_f_one_b.py
@@ -305,15 +305,16 @@ def backward_step(
         if output_obj_grad is None:
             optimizer.backward(output_obj)
         else:
-            if "backward_tensor_keys" not in output_obj:
-                for k, grad in output_obj_grad.items():
-                    optimizer.backward_by_grad(output_obj[k], grad)
+            keys = output_obj.get("backward_tensor_keys", output_obj_grad.keys())
+            tensors_to_backward = []
+            grads_to_backward = []
+            for k in keys:
+                tensors_to_backward.append(output_obj[k])
+                grads_to_backward.append(output_obj_grad[k])
+            if len(tensors_to_backward) == 1:
+                optimizer.backward_by_grad(tensors_to_backward[0], grads_to_backward[0])
             else:
-                for k, grad in output_obj_grad.items():
-                    output_obj[k].grad = grad
-                for k in output_obj["backward_tensor_keys"]:
-                    tensor_to_backward = output_obj[k]
-                    optimizer.backward_by_grad(tensor_to_backward, tensor_to_backward.grad)
+                optimizer.backward_by_grad(tensors_to_backward, grads_to_backward)
 
         # Collect the grad of the input_obj.
         input_obj_grad = None

From 2bcd0b6844d8610cf0965d954f0f35a5e950a001 Mon Sep 17 00:00:00 2001
From: botbw <wang1570@e.ntu.edu.sg>
Date: Mon, 14 Oct 2024 07:32:16 +0000
Subject: [PATCH 141/167] [ckpt] add safetensors util

---
 colossalai/utils/safetensors.py | 49 +++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 colossalai/utils/safetensors.py

diff --git a/colossalai/utils/safetensors.py b/colossalai/utils/safetensors.py
new file mode 100644
index 000000000000..4e295cdfc111
--- /dev/null
+++ b/colossalai/utils/safetensors.py
@@ -0,0 +1,49 @@
+# a python safetensors serializer modified from https://github.com/huggingface/safetensors/blob/41bd1acf38ad28ac559522d40596c6c802f79453/safetensors/src/tensor.rs#L214
+import json
+from dataclasses import asdict, dataclass
+from typing import Dict, List, Tuple
+
+import torch
+from safetensors.torch import _TYPES
+
+_TYPES_INV = {v: k for k, v in _TYPES.items()}
+
+
+@dataclass
+class TensorInfo:
+    dtype: str
+    shape: List[int]
+    data_offsets: Tuple[int, int]
+
+
+@dataclass
+class PreparedData:
+    n: int
+    header_bytes: bytes
+    offset: int
+
+
+def prepare(data: Dict[str, torch.Tensor]) -> Tuple[PreparedData, List[torch.Tensor]]:
+    sorted_data = sorted(data.items(), key=lambda x: (x[1].dtype, x[0]))
+
+    tensors = []
+    metadata = {}
+    offset = 0
+
+    for name, tensor in sorted_data:
+        n = tensor.numel() * tensor.element_size()
+        tensor_info = TensorInfo(
+            dtype=_TYPES_INV[tensor.dtype], shape=list(tensor.shape), data_offsets=(offset, offset + n)
+        )
+        offset += n
+        metadata[name] = asdict(tensor_info)
+        tensors.append(tensor)
+
+    metadata_buf = json.dumps(metadata).encode("utf-8")
+
+    extra = (8 - len(metadata_buf) % 8) % 8
+    metadata_buf += b" " * extra
+
+    n = len(metadata_buf)
+
+    return PreparedData(n=n, header_bytes=metadata_buf, offset=offset), tensors

From 3b1d7d1ae88683a27e45fba984050c970370b0e6 Mon Sep 17 00:00:00 2001
From: botbw <wang1570@e.ntu.edu.sg>
Date: Mon, 14 Oct 2024 09:41:25 +0000
Subject: [PATCH 142/167] [chore] refactor

---
 colossalai/utils/safetensors.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/colossalai/utils/safetensors.py b/colossalai/utils/safetensors.py
index 4e295cdfc111..9aa3558d9926 100644
--- a/colossalai/utils/safetensors.py
+++ b/colossalai/utils/safetensors.py
@@ -6,6 +6,10 @@
 import torch
 from safetensors.torch import _TYPES
 
+try:
+    from tensornvme.async_file_io import AsyncFileWriter
+except ModuleNotFoundError:
+    raise ModuleNotFoundError("Please install tensornvme to use NVMeOptimizer")
 _TYPES_INV = {v: k for k, v in _TYPES.items()}
 
 
@@ -47,3 +51,14 @@ def prepare(data: Dict[str, torch.Tensor]) -> Tuple[PreparedData, List[torch.Ten
     n = len(metadata_buf)
 
     return PreparedData(n=n, header_bytes=metadata_buf, offset=offset), tensors
+
+
+def save(f_writer: AsyncFileWriter, state_dict: Dict[str, torch.Tensor]) -> None:
+    prepared_data, tensors = prepare(state_dict)
+    n, header_bytes, _ = prepared_data.n, prepared_data.header_bytes, prepared_data.offset
+
+    f_writer.write(n.to_bytes(8, byteorder="little"))
+    f_writer.write(header_bytes)
+
+    for tensor in tensors:
+        f_writer.write_raw(tensor, tensor.data_ptr(), tensor.numel() * tensor.element_size(), f_writer.offset)

From 5ddad486cab3ef067d6ae0ab87475d52f34dc27f Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Fri, 18 Oct 2024 13:55:31 +0800
Subject: [PATCH 143/167] [fp8] add fallback and make compile option
 configurable (#6092)

---
 colossalai/quantization/fp8.py        | 6 +++++-
 colossalai/quantization/fp8_config.py | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)
 create mode 100644 colossalai/quantization/fp8_config.py

diff --git a/colossalai/quantization/fp8.py b/colossalai/quantization/fp8.py
index 8243a29ac825..e23da5cccd4d 100644
--- a/colossalai/quantization/fp8.py
+++ b/colossalai/quantization/fp8.py
@@ -8,6 +8,8 @@
 from packaging.version import Version
 from torch.distributed import ReduceOp
 
+from .fp8_config import dynamic_kernel
+
 SUPPORT_TORCH_COMPILE = Version(torch.__version__) >= Version("2.4.0")
 SCALE_BYTES = 4
 try:
@@ -832,11 +834,13 @@ def backward(ctx: Any, out_grad) -> Any:
         return x_grad.reshape(ctx.x_shape), w_grad, bias_grad
 
 
-@torch.compile(mode="max-autotune-no-cudagraphs", disable=not SUPPORT_TORCH_COMPILE, dynamic=False)
+@torch.compile(mode="max-autotune-no-cudagraphs", disable=not SUPPORT_TORCH_COMPILE, dynamic=dynamic_kernel)
 def _linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
     return _LinearFp8.apply(input, weight, bias)
 
 
 def linear_fp8(input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    if input.shape[-1] % 16 != 0 or np.prod(input.shape[:-1]) % 16 != 0:
+        return F.linear(input, weight, bias)
     out = _linear_fp8(input, weight, bias)
     return out
diff --git a/colossalai/quantization/fp8_config.py b/colossalai/quantization/fp8_config.py
new file mode 100644
index 000000000000..efa6251856aa
--- /dev/null
+++ b/colossalai/quantization/fp8_config.py
@@ -0,0 +1 @@
+dynamic_kernel: bool = False

From 58d8b8a2dd9a92c1dab3a44d2a35fb30716437c5 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Fri, 18 Oct 2024 16:48:52 +0800
Subject: [PATCH 144/167] [misc] fit torch api upgradation and remove legecy
 import (#6093)

* [amp] fit torch's new api

* [amp] fix api call

* [amp] fix api call

* [misc] fit torch pytree api upgrade

* [misc] remove legacy import

* [misc] fit torch amp api

* [misc] fit torch amp api
---
 colossalai/accelerator/cuda_accelerator.py            |  2 +-
 colossalai/kernel/jit/option.py                       |  2 +-
 colossalai/pipeline/schedule/_utils.py                | 10 ++++++++--
 .../zero/gemini/memory_tracer/runtime_mem_tracer.py   | 11 ++++++-----
 colossalai/zero/gemini/placement_policy.py            |  3 ++-
 .../features/mixed_precision_training_with_booster.md |  2 +-
 .../features/mixed_precision_training_with_booster.md |  2 +-
 7 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/colossalai/accelerator/cuda_accelerator.py b/colossalai/accelerator/cuda_accelerator.py
index f1ab487d4f58..32e62b33f86b 100644
--- a/colossalai/accelerator/cuda_accelerator.py
+++ b/colossalai/accelerator/cuda_accelerator.py
@@ -279,4 +279,4 @@ def autocast(
         """
         Return autocast function
         """
-        return torch.cuda.amp.autocast(enabled=enabled, dtype=dtype, cache_enabled=cache_enabled)
+        return torch.amp.autocast(device_type="cuda", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled)
diff --git a/colossalai/kernel/jit/option.py b/colossalai/kernel/jit/option.py
index d392649a62f2..1ee93e4e0d9f 100644
--- a/colossalai/kernel/jit/option.py
+++ b/colossalai/kernel/jit/option.py
@@ -1,7 +1,6 @@
 import torch
 
 from colossalai.accelerator import get_accelerator
-from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear
 
 from .bias_dropout_add import bias_dropout_add_fused_train
 from .bias_gelu import bias_gelu_impl
@@ -45,6 +44,7 @@ def warmup_jit_fusion(
     dtype: torch.dtype = torch.float32,
 ):
     """Compile JIT functions before the main training steps"""
+    from colossalai.legacy.nn.layer.colossalai_layer import Embedding, Linear
 
     embed = Embedding(vocab_size, hidden_size).to(get_accelerator().get_current_device())
     linear_1 = Linear(hidden_size, hidden_size * 4, skip_bias_add=True).to(get_accelerator().get_current_device())
diff --git a/colossalai/pipeline/schedule/_utils.py b/colossalai/pipeline/schedule/_utils.py
index 271b3238f5c4..8f42a9014e85 100644
--- a/colossalai/pipeline/schedule/_utils.py
+++ b/colossalai/pipeline/schedule/_utils.py
@@ -3,8 +3,9 @@
 
 import torch
 import torch.cuda
+from packaging.version import Version
 from torch.nn import Module
-from torch.utils._pytree import SUPPORTED_NODES, TreeSpec, _register_pytree_node, tree_flatten, tree_map, tree_unflatten
+from torch.utils._pytree import SUPPORTED_NODES, TreeSpec, tree_flatten, tree_map, tree_unflatten
 
 
 # this register are for torch under version 1.13.1, maybe removed in the future
@@ -16,7 +17,12 @@ def _odict_unflatten(values: List[Any], context: Any) -> "OrderedDict[Any, Any]"
     return OrderedDict((key, value) for key, value in zip(context, values))
 
 
-_register_pytree_node(OrderedDict, _odict_flatten, _odict_unflatten)
+if Version(torch.__version__) <= Version("1.13.1"):
+    try:
+        from torch.utils._pytree import register_pytree_node as _register_pytree_node
+    except ImportError:
+        from torch.utils._pytree import _register_pytree_node
+    _register_pytree_node(OrderedDict, _odict_flatten, _odict_unflatten)
 
 
 def tree_map_hf(fn: Any, pytree: Any):
diff --git a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
index b0d258824d2b..81520326f4cb 100644
--- a/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
+++ b/colossalai/zero/gemini/memory_tracer/runtime_mem_tracer.py
@@ -1,10 +1,5 @@
 import torch.nn
 
-from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import (
-    GradMemStats,
-    GradMemTracerHook,
-    ParamMemTracerHook,
-)
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
 from colossalai.utils import _cast_float
 
@@ -27,6 +22,12 @@ class RuntimeMemTracer:
 
     def __init__(self, module: torch.nn.Module, dtype: torch.dtype = torch.half):
         super().__init__()
+        from colossalai.legacy.zero.gemini.ophooks.runtime_mem_tracer_hook import (
+            GradMemStats,
+            GradMemTracerHook,
+            ParamMemTracerHook,
+        )
+
         self.module = module
         self.dtype = dtype
         self._gradstat = GradMemStats()
diff --git a/colossalai/zero/gemini/placement_policy.py b/colossalai/zero/gemini/placement_policy.py
index 178755d03107..2aa8dc3f6cdd 100644
--- a/colossalai/zero/gemini/placement_policy.py
+++ b/colossalai/zero/gemini/placement_policy.py
@@ -8,7 +8,6 @@
 import torch.distributed as dist
 
 from colossalai.accelerator import get_accelerator
-from colossalai.legacy.utils.memory import colo_device_memory_capacity
 from colossalai.zero.gemini.chunk import Chunk
 
 from .chunk import Chunk, ChunkManager
@@ -172,6 +171,8 @@ def evict_tensors(
         Returns:
             int: the volume of memory that is evicted
         """
+        from colossalai.legacy.utils.memory import colo_device_memory_capacity
+
         start = time()
         cuda_capacity = colo_device_memory_capacity(get_accelerator().get_current_device())
         used_cuda_model_data = self.chunk_manager.total_mem["cuda"]
diff --git a/docs/source/en/features/mixed_precision_training_with_booster.md b/docs/source/en/features/mixed_precision_training_with_booster.md
index 65304b1f4e65..1e17c2bb584d 100644
--- a/docs/source/en/features/mixed_precision_training_with_booster.md
+++ b/docs/source/en/features/mixed_precision_training_with_booster.md
@@ -16,7 +16,7 @@ Author: [Mingyan Jiang](https://github.com/jiangmingyan)
 AMP stands for automatic mixed precision training.
 In Colossal-AI, we have incorporated different implementations of mixed precision training:
 
-1. torch.cuda.amp
+1. torch.amp
 2. apex.amp
 3. naive amp
 
diff --git a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
index da377ceb294b..93a69830cadf 100644
--- a/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
+++ b/docs/source/zh-Hans/features/mixed_precision_training_with_booster.md
@@ -16,7 +16,7 @@
 AMP 代表自动混合精度训练。
 在 Colossal-AI 中, 我们结合了混合精度训练的不同实现:
 
-1. torch.cuda.amp
+1. torch.amp
 2. apex.amp
 3. naive amp
 

From 19baab5fd5c35ccff1ff0790bf846386e87fd029 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Mon, 21 Oct 2024 10:19:08 +0800
Subject: [PATCH 145/167] [release] update version (#6094)

---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index 6f2743d65dc0..0bfccb080404 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.4.4
+0.4.5

From b10339df7cfac1eee6945df37a80fd1c38f42289 Mon Sep 17 00:00:00 2001
From: BurkeHulk <hangxu0304@gmail.com>
Date: Mon, 21 Oct 2024 13:55:43 +0800
Subject: [PATCH 146/167] fix lora ckpt save format (ColoTensor to Tensor)

---
 colossalai/booster/plugin/low_level_zero_plugin.py        | 3 ++-
 colossalai/booster/plugin/torch_ddp_plugin.py             | 6 +++++-
 colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py | 5 ++++-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index b167b5c7a59e..97fabe63a931 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -290,7 +290,8 @@ def save_lora_as_pretrained(self, model, checkpoint, use_safetensors):
         assert isinstance(
             peft_model, PeftModel
         ), "The model doesn't have lora adapters, please enable lora before saving."
-        return peft_model.save_pretrained(checkpoint, safe_serialization=use_safetensors)
+        return peft_model.save_pretrained(checkpoint, safe_serialization=use_safetensors,
+                                          state_dict = tree_map(lambda x: x.data if torch.is_tensor(x) else x, peft_model.state_dict()))
 
 
 class LowLevelZeroPlugin(DPPluginBase):
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
index ec7ce7f9aae4..aa4d35cd4fdb 100644
--- a/colossalai/booster/plugin/torch_ddp_plugin.py
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -1,10 +1,12 @@
 from typing import Callable, Dict, Iterator, List, Optional, Tuple, Union
 
+import torch
 import torch.nn as nn
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
 from torch.utils.data import DataLoader
+from torch.utils._pytree import tree_map
 
 from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO
 from colossalai.cluster import DistCoordinator
@@ -134,7 +136,9 @@ def save_lora_as_pretrained(
             assert isinstance(
                 peft_model, PeftModel
             ), "The model doesn't have lora adapters, please enable lora before saving."
-            peft_model.save_pretrained(save_directory=checkpoint, safe_serialization=use_safetensors)
+            return peft_model.save_pretrained(checkpoint, safe_serialization=use_safetensors,
+                                              state_dict=tree_map(lambda x: x.data if torch.is_tensor(x) else x,
+                                                                  peft_model.state_dict()))
 
 
 class TorchDDPModel(ModelWrapper):
diff --git a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
index 3b6917d32fa6..4ca1353d854d 100644
--- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
+++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
@@ -11,6 +11,7 @@
 import torch.nn as nn
 from torch.distributed import ProcessGroup
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
+from torch.utils._pytree import tree_map
 
 from colossalai.cluster import DistCoordinator
 from colossalai.interface import ModelWrapper, OptimizerWrapper
@@ -956,4 +957,6 @@ def save_lora_as_pretrained(self, model, checkpoint, use_safetensors):
         assert isinstance(
             peft_model, PeftModel
         ), "The model doesn't have lora adapters, please enable lora before saving."
-        return peft_model.save_pretrained(checkpoint, safe_serialization=use_safetensors)
+        return peft_model.save_pretrained(checkpoint, safe_serialization=use_safetensors,
+                                          state_dict=tree_map(lambda x: x.data if torch.is_tensor(x) else x,
+                                                              peft_model.state_dict()))

From 6d6cafabe28e6665c35fb585ac162237e09a789b Mon Sep 17 00:00:00 2001
From: BurkeHulk <hangxu0304@gmail.com>
Date: Mon, 21 Oct 2024 14:04:32 +0800
Subject: [PATCH 147/167] pre-commit fix

---
 colossalai/booster/plugin/low_level_zero_plugin.py     |  7 +++++--
 colossalai/booster/plugin/torch_ddp_plugin.py          | 10 ++++++----
 .../checkpoint_io/hybrid_parallel_checkpoint_io.py     |  8 +++++---
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/colossalai/booster/plugin/low_level_zero_plugin.py b/colossalai/booster/plugin/low_level_zero_plugin.py
index 97fabe63a931..f3a6901ada6b 100644
--- a/colossalai/booster/plugin/low_level_zero_plugin.py
+++ b/colossalai/booster/plugin/low_level_zero_plugin.py
@@ -290,8 +290,11 @@ def save_lora_as_pretrained(self, model, checkpoint, use_safetensors):
         assert isinstance(
             peft_model, PeftModel
         ), "The model doesn't have lora adapters, please enable lora before saving."
-        return peft_model.save_pretrained(checkpoint, safe_serialization=use_safetensors,
-                                          state_dict = tree_map(lambda x: x.data if torch.is_tensor(x) else x, peft_model.state_dict()))
+        return peft_model.save_pretrained(
+            checkpoint,
+            safe_serialization=use_safetensors,
+            state_dict=tree_map(lambda x: x.data if torch.is_tensor(x) else x, peft_model.state_dict()),
+        )
 
 
 class LowLevelZeroPlugin(DPPluginBase):
diff --git a/colossalai/booster/plugin/torch_ddp_plugin.py b/colossalai/booster/plugin/torch_ddp_plugin.py
index aa4d35cd4fdb..156a4acf92d5 100644
--- a/colossalai/booster/plugin/torch_ddp_plugin.py
+++ b/colossalai/booster/plugin/torch_ddp_plugin.py
@@ -5,8 +5,8 @@
 from torch.nn.parallel import DistributedDataParallel as DDP
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler as LRScheduler
-from torch.utils.data import DataLoader
 from torch.utils._pytree import tree_map
+from torch.utils.data import DataLoader
 
 from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO
 from colossalai.cluster import DistCoordinator
@@ -136,9 +136,11 @@ def save_lora_as_pretrained(
             assert isinstance(
                 peft_model, PeftModel
             ), "The model doesn't have lora adapters, please enable lora before saving."
-            return peft_model.save_pretrained(checkpoint, safe_serialization=use_safetensors,
-                                              state_dict=tree_map(lambda x: x.data if torch.is_tensor(x) else x,
-                                                                  peft_model.state_dict()))
+            return peft_model.save_pretrained(
+                checkpoint,
+                safe_serialization=use_safetensors,
+                state_dict=tree_map(lambda x: x.data if torch.is_tensor(x) else x, peft_model.state_dict()),
+            )
 
 
 class TorchDDPModel(ModelWrapper):
diff --git a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
index 4ca1353d854d..e6abf59e3ac3 100644
--- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
+++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
@@ -957,6 +957,8 @@ def save_lora_as_pretrained(self, model, checkpoint, use_safetensors):
         assert isinstance(
             peft_model, PeftModel
         ), "The model doesn't have lora adapters, please enable lora before saving."
-        return peft_model.save_pretrained(checkpoint, safe_serialization=use_safetensors,
-                                          state_dict=tree_map(lambda x: x.data if torch.is_tensor(x) else x,
-                                                              peft_model.state_dict()))
+        return peft_model.save_pretrained(
+            checkpoint,
+            safe_serialization=use_safetensors,
+            state_dict=tree_map(lambda x: x.data if torch.is_tensor(x) else x, peft_model.state_dict()),
+        )

From 80a8ca916a740e913cbedf60caeadc0bab5cb4fa Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Thu, 24 Oct 2024 11:11:44 +0800
Subject: [PATCH 148/167] [extension] hotfix compile check (#6099)

---
 extensions/cpp_extension.py  | 2 +-
 extensions/cuda_extension.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/extensions/cpp_extension.py b/extensions/cpp_extension.py
index aaa43f964c25..a92195dda3d9 100644
--- a/extensions/cpp_extension.py
+++ b/extensions/cpp_extension.py
@@ -79,7 +79,7 @@ def build_jit(self) -> None:
 
         # check if the kernel has been built
         compiled_before = False
-        kernel_file_path = build_directory.joinpath(f"{self.name}.o")
+        kernel_file_path = build_directory.joinpath(f"{self.name}.so")
         if kernel_file_path.exists():
             compiled_before = True
 
diff --git a/extensions/cuda_extension.py b/extensions/cuda_extension.py
index da15bcd57a26..214b83ec83cc 100644
--- a/extensions/cuda_extension.py
+++ b/extensions/cuda_extension.py
@@ -74,7 +74,7 @@ def build_jit(self) -> None:
 
         # check if the kernel has been built
         compiled_before = False
-        kernel_file_path = build_directory.joinpath(f"{self.name}.o")
+        kernel_file_path = build_directory.joinpath(f"{self.name}.so")
         if kernel_file_path.exists():
             compiled_before = True
 

From 4294ae83bba9dc36dccee47ea7d9b5b6c2de87d7 Mon Sep 17 00:00:00 2001
From: binmakeswell <binmakeswell@gmail.com>
Date: Thu, 24 Oct 2024 13:24:37 +0800
Subject: [PATCH 149/167] [doc] sora solution news (#6100)

* [doc] sora solution news

* [doc] sora solution news
---
 README.md              | 5 +----
 docs/README-zh-Hans.md | 6 +-----
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index bf26e154f0b0..7b2004dfc26c 100644
--- a/README.md
+++ b/README.md
@@ -48,16 +48,13 @@ Special Bonuses:
 
 
 ## Latest News
+* [2024/10] [How to build a low-cost Sora-like app? Solutions for you](https://company.hpc-ai.com/blog/how-to-build-a-low-cost-sora-like-app-solutions-for-you)
 * [2024/09] [Singapore Startup HPC-AI Tech Secures 50 Million USD in Series A Funding to Build the Video Generation AI Model and GPU Platform](https://company.hpc-ai.com/blog/singapore-startup-hpc-ai-tech-secures-50-million-usd-in-series-a-funding-to-build-the-video-generation-ai-model-and-gpu-platform)
 * [2024/09] [Reducing AI Large Model Training Costs by 30% Requires Just a Single Line of Code From FP8 Mixed Precision Training Upgrades](https://company.hpc-ai.com/blog/reducing-ai-large-model-training-costs-by-30-requires-just-a-single-line-of-code-from-fp8-mixed-precision-training-upgrades)
 * [2024/06] [Open-Sora Continues Open Source: Generate Any 16-Second 720p HD Video with One Click, Model Weights Ready to Use](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use)
 * [2024/05] [Large AI Models Inference Speed Doubled, Colossal-Inference Open Source Release](https://hpc-ai.com/blog/colossal-inference)
 * [2024/04] [Open-Sora Unveils Major Upgrade: Embracing Open Source with Single-Shot 16-Second Video Generation and 720p Resolution](https://hpc-ai.com/blog/open-soras-comprehensive-upgrade-unveiled-embracing-16-second-video-generation-and-720p-resolution-in-open-source)
 * [2024/04] [Most cost-effective solutions for inference, fine-tuning and pretraining, tailored to LLaMA3 series](https://hpc-ai.com/blog/most-cost-effective-solutions-for-inference-fine-tuning-and-pretraining-tailored-to-llama3-series)
-* [2024/03] [314 Billion Parameter Grok-1 Inference Accelerated by 3.8x, Efficient and Easy-to-Use PyTorch+HuggingFace version is Here](https://hpc-ai.com/blog/314-billion-parameter-grok-1-inference-accelerated-by-3.8x-efficient-and-easy-to-use-pytorchhuggingface-version-is-here)
-* [2024/03] [Open-Sora: Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models](https://hpc-ai.com/blog/open-sora-v1.0)
-* [2024/03] [Open-Sora：Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million](https://hpc-ai.com/blog/open-sora)
-* [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer)
 
 ## Table of Contents
 <ul>
diff --git a/docs/README-zh-Hans.md b/docs/README-zh-Hans.md
index 2f3075bbd71a..c32e5f13c913 100644
--- a/docs/README-zh-Hans.md
+++ b/docs/README-zh-Hans.md
@@ -25,17 +25,13 @@
 </div>
 
 ## 新闻
+* [2024/10] [How to build a low-cost Sora-like app? Solutions for you](https://company.hpc-ai.com/blog/how-to-build-a-low-cost-sora-like-app-solutions-for-you)
 * [2024/09] [Singapore Startup HPC-AI Tech Secures 50 Million USD in Series A Funding to Build the Video Generation AI Model and GPU Platform](https://company.hpc-ai.com/blog/singapore-startup-hpc-ai-tech-secures-50-million-usd-in-series-a-funding-to-build-the-video-generation-ai-model-and-gpu-platform)
 * [2024/09] [Reducing AI Large Model Training Costs by 30% Requires Just a Single Line of Code From FP8 Mixed Precision Training Upgrades](https://company.hpc-ai.com/blog/reducing-ai-large-model-training-costs-by-30-requires-just-a-single-line-of-code-from-fp8-mixed-precision-training-upgrades)
 * [2024/06] [Open-Sora Continues Open Source: Generate Any 16-Second 720p HD Video with One Click, Model Weights Ready to Use](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use)
 * [2024/05] [Large AI Models Inference Speed Doubled, Colossal-Inference Open Source Release](https://hpc-ai.com/blog/colossal-inference)
 * [2024/04] [Open-Sora Unveils Major Upgrade: Embracing Open Source with Single-Shot 16-Second Video Generation and 720p Resolution](https://hpc-ai.com/blog/open-soras-comprehensive-upgrade-unveiled-embracing-16-second-video-generation-and-720p-resolution-in-open-source)
 * [2024/04] [Most cost-effective solutions for inference, fine-tuning and pretraining, tailored to LLaMA3 series](https://hpc-ai.com/blog/most-cost-effective-solutions-for-inference-fine-tuning-and-pretraining-tailored-to-llama3-series)
-* [2024/03] [314 Billion Parameter Grok-1 Inference Accelerated by 3.8x, Efficient and Easy-to-Use PyTorch+HuggingFace version is Here](https://hpc-ai.com/blog/314-billion-parameter-grok-1-inference-accelerated-by-3.8x-efficient-and-easy-to-use-pytorchhuggingface-version-is-here)
-* [2024/03] [Open-Sora: Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models](https://hpc-ai.com/blog/open-sora-v1.0)
-* [2024/03] [Open-Sora：Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million](https://hpc-ai.com/blog/open-sora)
-* [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer)
-* [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth)
 
 ## 目录
 <ul>

From 2eca112c90001223fec9a367362093422ba7b2c0 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Thu, 24 Oct 2024 07:30:19 +0000
Subject: [PATCH 150/167] [feat] support meta cache, meta_grad_send,
 meta_tensor_send; fix runtime too long in Recv Bwd; benchmark for llama +
 Hybrid(tp+pp);

---
 .../pipeline/schedule/zero_bubble_pp.py       | 107 +++++++++++++-----
 colossalai/pipeline/stage_manager.py          |   2 +-
 colossalai/pipeline/weight_grad_store.py      |  55 ++++++++-
 colossalai/shardformer/modeling/llama.py      |   7 ++
 colossalai/shardformer/policies/llama.py      |  27 +++--
 examples/language/llama/benchmark.py          |  20 ++--
 examples/language/performance_evaluator.py    |  15 ++-
 .../test_schedule/test_zerobubble_pp.py       |  16 +--
 8 files changed, 185 insertions(+), 64 deletions(-)

diff --git a/colossalai/pipeline/schedule/zero_bubble_pp.py b/colossalai/pipeline/schedule/zero_bubble_pp.py
index e155284bfc1b..408cdffc22a2 100644
--- a/colossalai/pipeline/schedule/zero_bubble_pp.py
+++ b/colossalai/pipeline/schedule/zero_bubble_pp.py
@@ -8,7 +8,7 @@
 
 from colossalai.accelerator import get_accelerator
 from colossalai.interface import OptimizerWrapper
-from colossalai.pipeline.p2p import PipelineP2PCommunication
+from colossalai.pipeline.p2p import PipelineP2PCommunication, create_send_metadata
 from colossalai.pipeline.schedule.v_schedule import ScheduledNode
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.pipeline.weight_grad_store import WeightGradStore
@@ -62,11 +62,11 @@ def __init__(
         self.do_post_validation = False
 
         # P2PMeta cache
-        # self.enable_metadata_cache = enable_metadata_cache
-        # self.send_tensor_metadata = True
-        # self.send_grad_metadata = True
-        # self.tensor_metadata_recv = None
-        # self.grad_metadata_recv = None
+        self.enable_metadata_cache = enable_metadata_cache
+        self.send_tensor_metadata = True
+        self.send_grad_metadata = True
+        self.tensor_metadata_recv = None
+        self.grad_metadata_recv = None
 
         # P2P communication
         self.comm = PipelineP2PCommunication(stage_manager, overlap_p2p=overlap_p2p)
@@ -105,8 +105,11 @@ def _free_buffers(self):
         # dy buffer for local send bwd
         self.local_send_backward_buffer = []
 
+        # wait pp buffer
+        self.send_handles = []
+
     def assert_buffer_empty(self):
-        # assert buuffer is empty at end
+        # assert buffer is empty at end
         assert len(self.input_tensors[0]) == 0
         assert len(self.input_tensors[1]) == 0
         assert len(self.output_tensors[0]) == 0
@@ -125,6 +128,7 @@ def assert_buffer_empty(self):
         assert len(self.recv_backward_buffer[1]) == 0
         assert len(self.local_send_forward_buffer) == 0
         assert len(self.local_send_backward_buffer) == 0
+        # assert len(self.send_handles) == 0
 
     def load_batch(self, data_iter: Iterable, device: Optional[torch.device] = None) -> None:
         """Load a batch from data iterator.
@@ -221,7 +225,8 @@ def recv_forward(self, model_chunk_id: int, prev_rank: int = None) -> Tuple[Any,
                 # do nothing; cause u are chunk 0 in first rank, u have no prev rank;
                 #################
                 if self.stage_manager.is_first_stage(ignore_chunk=True):
-                    return None, []
+                    # return None, []
+                    return []
 
                 ################
                 # chunk = 0 & not is_first_stage
@@ -229,9 +234,14 @@ def recv_forward(self, model_chunk_id: int, prev_rank: int = None) -> Tuple[Any,
                 #################
                 else:
                     prev_rank = self.stage_manager.get_prev_rank()
-                    input_tensor, wait_handles = self.comm.recv_forward(prev_rank=prev_rank)
+                    input_tensor, wait_handles = self.comm.recv_forward(
+                        prev_rank=prev_rank, metadata_recv=self.tensor_metadata_recv
+                    )
+                    if self.enable_metadata_cache and self.tensor_metadata_recv is None:
+                        self.tensor_metadata_recv = create_send_metadata(input_tensor)
                     self.recv_forward_buffer[model_chunk_id].append(input_tensor)
-                    return input_tensor, wait_handles
+                    # return input_tensor, wait_handles
+                    return wait_handles
 
             else:
                 ################
@@ -239,7 +249,8 @@ def recv_forward(self, model_chunk_id: int, prev_rank: int = None) -> Tuple[Any,
                 # do nothing; cause u get y from local_send_forward_buffer in schedule f
                 ################
                 if self.stage_manager.is_last_stage(ignore_chunk=True):
-                    return None, []
+                    # return None, []
+                    return []
 
                 ################
                 # chunk = 1 & not is_last_stage
@@ -247,9 +258,14 @@ def recv_forward(self, model_chunk_id: int, prev_rank: int = None) -> Tuple[Any,
                 ################
                 else:
                     next_rank = self.stage_manager.get_next_rank()
-                    input_tensor, wait_handles = self.comm.recv_forward(next_rank)
+                    input_tensor, wait_handles = self.comm.recv_forward(
+                        next_rank, metadata_recv=self.tensor_metadata_recv
+                    )
+                    if self.enable_metadata_cache and self.tensor_metadata_recv is None:
+                        self.tensor_metadata_recv = create_send_metadata(input_tensor)
                     self.recv_forward_buffer[model_chunk_id].append(input_tensor)
-                    return input_tensor, wait_handles
+                    # return input_tensor, wait_handles
+                    return wait_handles
 
     def recv_backward(self, model_chunk_id: int, next_rank: int = None) -> Tuple[Any, List]:
         """Copy the gradient tensor from the next stage in pipeline as the input gradient of this stage.
@@ -271,7 +287,8 @@ def recv_backward(self, model_chunk_id: int, next_rank: int = None) -> Tuple[Any
                 # do nothing; Already get dy from local_send_backward_buffer in schedule b
                 ################
                 if self.stage_manager.is_last_stage(ignore_chunk=True):
-                    return None, []
+                    # return None, []
+                    return []
 
                 ################
                 # chunk = 0 & not is_last_stage
@@ -279,9 +296,14 @@ def recv_backward(self, model_chunk_id: int, next_rank: int = None) -> Tuple[Any
                 ################
                 else:
                     next_rank = self.stage_manager.get_next_rank()
-                    output_tensor_grad, wait_handles = self.comm.recv_backward(next_rank)
+                    output_tensor_grad, wait_handles = self.comm.recv_backward(
+                        next_rank, metadata_recv=self.grad_metadata_recv
+                    )
+                    if self.enable_metadata_cache and self.grad_metadata_recv is None:
+                        self.grad_metadata_recv = create_send_metadata(output_tensor_grad)
                     self.recv_backward_buffer[model_chunk_id].append(output_tensor_grad)
-                    return output_tensor_grad, wait_handles
+                    # return output_tensor_grad, wait_handles
+                    return wait_handles
 
             else:
                 # bwd chunk1 is left V;
@@ -290,7 +312,8 @@ def recv_backward(self, model_chunk_id: int, next_rank: int = None) -> Tuple[Any
                 # do nothing; get loss from local
                 ################
                 if self.stage_manager.is_first_stage(ignore_chunk=True):
-                    return None, []
+                    # return None, []
+                    return []
 
                 ################
                 # chunk = 1 & not first stage
@@ -298,9 +321,14 @@ def recv_backward(self, model_chunk_id: int, next_rank: int = None) -> Tuple[Any
                 ################
                 else:
                     prev_rank = self.stage_manager.get_prev_rank()
-                    output_tensor_grad, wait_handles = self.comm.recv_backward(next_rank=prev_rank)
+                    output_tensor_grad, wait_handles = self.comm.recv_backward(
+                        next_rank=prev_rank, metadata_recv=self.grad_metadata_recv
+                    )
+                    if self.enable_metadata_cache and self.grad_metadata_recv is None:
+                        self.grad_metadata_recv = create_send_metadata(output_tensor_grad)
                     self.recv_backward_buffer[model_chunk_id].append(output_tensor_grad)
-                    return output_tensor_grad, wait_handles
+                    # return output_tensor_grad, wait_handles
+                    return wait_handles
 
     def send_forward(self, model_chunk_id: int, next_rank: int = None) -> List:
         """Sends the input tensor to the next stage in pipeline.
@@ -330,7 +358,10 @@ def send_forward(self, model_chunk_id: int, next_rank: int = None) -> List:
                 else:
                     next_rank = self.stage_manager.get_next_rank()
                     output_tensor = self.send_forward_buffer[model_chunk_id].pop(0)
-                    send_handles = self.comm.send_forward(output_object=output_tensor, next_rank=next_rank)
+                    send_handles = self.comm.send_forward(
+                        output_object=output_tensor, next_rank=next_rank, send_metadata=self.send_tensor_metadata
+                    )
+                    self.send_tensor_metadata = not self.enable_metadata_cache
                     return send_handles
 
             else:
@@ -348,7 +379,10 @@ def send_forward(self, model_chunk_id: int, next_rank: int = None) -> List:
                 else:
                     prev_rank = self.stage_manager.get_prev_rank()
                     output_tensor = self.send_forward_buffer[model_chunk_id].pop(0)
-                    send_handles = self.comm.send_forward(output_tensor, prev_rank)
+                    send_handles = self.comm.send_forward(
+                        output_tensor, prev_rank, send_metadata=self.send_tensor_metadata
+                    )
+                    self.send_tensor_metadata = not self.enable_metadata_cache
                     return send_handles
 
     def send_backward(self, model_chunk_id: int, prev_rank: int = None) -> List:
@@ -380,7 +414,10 @@ def send_backward(self, model_chunk_id: int, prev_rank: int = None) -> List:
                 else:
                     prev_rank = self.stage_manager.get_prev_rank()
                     input_tensor_grad = self.send_backward_buffer[model_chunk_id].pop(0)
-                    send_handles = self.comm.send_backward(input_tensor_grad, prev_rank)
+                    send_handles = self.comm.send_backward(
+                        input_tensor_grad, prev_rank, send_metadata=self.send_grad_metadata
+                    )
+                    self.send_grad_metadata = not self.enable_metadata_cache
                     return send_handles
 
             # bwd chunk1 is left V;
@@ -399,7 +436,10 @@ def send_backward(self, model_chunk_id: int, prev_rank: int = None) -> List:
                 else:
                     next_rank = self.stage_manager.get_next_rank()
                     input_tensor_grad = self.send_backward_buffer[model_chunk_id].pop(0)
-                    send_handles = self.comm.send_backward(input_tensor_grad, next_rank)
+                    send_handles = self.comm.send_backward(
+                        input_tensor_grad, next_rank, send_metadata=self.send_grad_metadata
+                    )
+                    self.send_grad_metadata = not self.enable_metadata_cache
                     return send_handles
 
     def forward_step(
@@ -479,11 +519,11 @@ def backward_b_step(
         output_obj_grad_ = []
 
         # For chunk 0 stage 0, use micro_batch as input_obj_; and we don't have to cal microbatch dx.
-        if model_chunk_id == 0 and self.stage_manager.is_first_stage(ignore_chunk=True):
-            return None
+        # if model_chunk_id == 0 and self.stage_manager.is_first_stage(ignore_chunk=True):
+        #     return None
 
         # For loss backward; output_obj is loss; output_obj_grad should be None
-        elif model_chunk_id == 1 and self.stage_manager.is_first_stage(ignore_chunk=True):
+        if model_chunk_id == 1 and self.stage_manager.is_first_stage(ignore_chunk=True):
             assert output_obj_grad is None
             input_obj_, _ = tree_flatten(input_obj)
             output_obj_.append(output_obj)  # LOSS
@@ -510,7 +550,7 @@ def backward_b_step(
                 tensor=output_obj_,
                 grad=output_obj_grad_,
                 # inputs=input_obj_,
-                # retain_graph=True,
+                retain_graph=False,
             )
         # Format output_obj_grad
         input_obj_grad = dict()
@@ -712,6 +752,12 @@ def schedule_b(
         # else:
         #     # we save output_tensor_grad here
         #     self.output_tensors_grad_dw[model_chunk_id].append(output_tensor_grad)
+        # the_output_obj_grad = []
+        # if isinstance(output_obj, dict):
+        #     for (k, v) in output_obj.items():
+        #         the_output_obj_grad.append(v.requires_grad)
+        # else:
+        #     the_output_obj_grad.append(output_obj.requires_grad)
 
         input_object_grad = self.backward_b_step(
             model_chunk=model_chunk,
@@ -844,7 +890,8 @@ def run_forward_backward(
             if scheduled_node.type in AUTO_SCHEDULE_COMMUNICATION_TYPES:
                 # communication
                 communication_func = self.communication_map[scheduled_node.type]
-                communication_func(scheduled_node.chunk)
+                wait_handle = communication_func(scheduled_node.chunk)
+                self.send_handles.append(wait_handle)
             elif scheduled_node.type == "F":
                 self.schedule_f(
                     scheduled_node=scheduled_node,
@@ -868,6 +915,9 @@ def run_forward_backward(
                     model_chunk_id=scheduled_node.chunk,
                     optimizer=optimizer,
                 )
+        for h in self.send_handles:
+            for hh in h:
+                hh.wait()
 
         # return loss & output
         if outputs is not None:
@@ -907,5 +957,4 @@ def forward_backward_step(
             )
 
         self.assert_buffer_empty()
-
         return result
diff --git a/colossalai/pipeline/stage_manager.py b/colossalai/pipeline/stage_manager.py
index 5cc32114daff..f30ab8e59964 100644
--- a/colossalai/pipeline/stage_manager.py
+++ b/colossalai/pipeline/stage_manager.py
@@ -223,10 +223,10 @@ def distribute_layers(
 
         # calculate the num_layers per stage
         layers_per_stage = [quotient] * num_stages * num_model_chunks
-
         # deal with the rest layers
         if remainder > 0:
             start_position = (num_stages * num_model_chunks) // 2 - remainder // 2
             for i in range(start_position, start_position + remainder):
                 layers_per_stage[i] += 1
+        # print(f"layers_per_stage {layers_per_stage}")
         return layers_per_stage
diff --git a/colossalai/pipeline/weight_grad_store.py b/colossalai/pipeline/weight_grad_store.py
index 12963350f462..dff4fdd0200e 100644
--- a/colossalai/pipeline/weight_grad_store.py
+++ b/colossalai/pipeline/weight_grad_store.py
@@ -1,9 +1,6 @@
 import queue
 
-# from megatron import get_args
-# from megatron.core import parallel_state
-# from megatron.core.distributed.finalize_model_grads import _allreduce_embedding_grads
-# from megatron.core.utils import get_model_config, get_attr_wrapped_model
+from colossalai.pipeline.stage_manager import PipelineStageManager
 
 
 class WeightGradStore:
@@ -23,6 +20,7 @@ def flush(cls, chunk=0):
 
     @classmethod
     def pop(cls, chunk=0):
+        # print(f"chunk id {chunk} queue size {cls.weight_grad_queue[chunk].qsize()}")
         if cls.weight_grad_queue[chunk].qsize() > 0:
             stored_grads = cls.weight_grad_queue[chunk].get()
             for total_input, grad_output, weight, func in stored_grads:
@@ -34,3 +32,52 @@ def pop(cls, chunk=0):
                     weight.grad = grad_weight
         else:
             raise Exception("Pop empty queue.")
+
+    @classmethod
+    def clear(cls, stage_manager: PipelineStageManager, chunk=0):
+        pass
+        # print(f"stage {stage_manager.stage} len_chunk_0 {cls.weight_grad_queue[0].qsize()} len_chunk_1 {cls.weight_grad_queue[1].qsize()}")
+        # while cls.weight_grad_queue[chunk].qsize() > 0:
+        #     stored_grads = cls.weight_grad_queue[chunk].get()
+        #     for total_input, grad_output, weight, func in stored_grads:
+        #         if weight.grad is not None:
+        #             func(total_input, grad_output, weight.grad)
+        #         # for first bwd; weight.grad is None, assign grad_weight to weight.grad
+        #         else:
+        #             grad_weight = func(total_input, grad_output)
+        #             weight.grad = grad_weight
+
+        # weight_grad_tasks = []
+        # while cls.weight_grad_queue[chunk].qsize() > 0:
+        #     stored_grads = cls.weight_grad_queue[chunk].get()
+        #     if len(weight_grad_tasks) == 0:
+        #         for _ in stored_grads:
+        #             weight_grad_tasks.append([])
+        #     else:
+        #         assert len(weight_grad_tasks) == len(stored_grads)
+        #     for i, task in enumerate(stored_grads):
+        #         weight_grad_tasks[i].append(task)
+
+        # if stage_manager.is_last_stage(ignore_chunk=True) and chunk == 1:
+        #     assert len(weight_grad_tasks) > 0
+        #     output_layer_grads = weight_grad_tasks[0]
+        #     for j in range(len(output_layer_grads)):
+        #         total_input, grad_output, weight, func = output_layer_grads[j]
+        #         if output_layer_weight is None:
+        #             output_layer_weight = weight
+        #         assert output_layer_weight is weight
+        #         func(total_input, grad_output, weight.grad)
+        #         output_layer_grads[j] = None  # release memory
+        #     weight_grad_tasks = weight_grad_tasks[1:]
+
+        # for i in range(len(weight_grad_tasks)):
+        #     tasks = weight_grad_tasks[i]
+        #     param = None
+        #     for j in range(len(tasks)):
+        #         total_input, grad_output, weight, func = tasks[j]
+        #         if param is None:
+        #             param = weight
+        #         assert param is weight
+        #         func(total_input, grad_output, weight.grad)
+        #         tasks[j] = None  # release memory
+        #     weight_grad_tasks[i] = None  # release memory
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 7a04c5451cfc..a02db1168b8c 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -32,6 +32,7 @@
 from ..layer import ColoAttention, RingAttention, dist_cross_entropy
 
 _SUPPORTED_SP_MODE = ["all_to_all", "split_gather", "ring", "ring_attn"]
+_GLOBAL_ORDER_ = 0
 
 
 class LlamaPipelineForwards:
@@ -193,6 +194,10 @@ def llama_model_forward(
             assert num_ckpt_layers <= end_idx - start_idx
 
         for idx, decoder_layer in enumerate(self.layers[start_idx:end_idx], start=start_idx):
+            # global _GLOBAL_ORDER_
+            # if torch.distributed.get_rank() == 0:
+            #     print(f"rank {torch.distributed.get_rank()} {stage_manager.stage}; start:{start_idx}, end:{end_idx} hidden_states require grad{hidden_states.requires_grad}")
+            # # _GLOBAL_ORDER_ += 1
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
             if idx - start_idx < num_ckpt_layers:
@@ -216,6 +221,8 @@ def llama_model_forward(
                     use_cache=use_cache,
                     cache_position=cache_position,
                 )
+                # if torch.distributed.get_rank() == 0:
+                #     print(f"rank {torch.distributed.get_rank()} {stage_manager.stage}; start:{start_idx}, end:{end_idx} layer_outputs require grad {layer_outputs[0].requires_grad}")
             hidden_states = layer_outputs[0]
 
             if use_cache:
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index db4515d7ea65..8a980bf9d621 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -96,7 +96,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 target_key=attn_cls,
             )
 
-        if self.pipeline_stage_manager is None:
+        if self.pipeline_stage_manager is not None:
             self.append_or_create_method_replacement(
                 description={
                     "forward": partial(
@@ -298,7 +298,6 @@ def get_held_layers(self) -> List[Module]:
                 not stage_manager.use_zbv and stage_manager.is_last_stage(ignore_chunk=True)
             ):
                 held_layers.append(module.norm)
-
         else:
             layers_per_stage = stage_manager.distribute_layers(len(module.layers))
             if stage_manager.is_first_stage():
@@ -395,8 +394,8 @@ def get_held_layers(self) -> List[Module]:
         return held_layers
 
     def get_shared_params(self) -> List[Dict[int, Tensor]]:
-        if self.pipeline_stage_manager is not None and self.pipeline_stage_manager.use_zbv:
-            return []
+        # if self.pipeline_stage_manager is not None and self.pipeline_stage_manager.use_zbv:
+        #     return []
         llama_model = self.model.model
         if self.pipeline_stage_manager and self.pipeline_stage_manager.num_stages > 1:
             if (
@@ -404,12 +403,20 @@ def get_shared_params(self) -> List[Dict[int, Tensor]]:
                 and self.pipeline_stage_manager.num_stages > 1
             ):
                 # tie weights
-                return [
-                    {
-                        0: llama_model.embed_tokens.weight,
-                        self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
-                    }
-                ]
+                if self.pipeline_stage_manager.use_zbv:
+                    return [
+                        {
+                            0: llama_model.embed_tokens.weight,
+                            0: self.model.lm_head.weight,
+                        }
+                    ]
+                else:
+                    return [
+                        {
+                            0: llama_model.embed_tokens.weight,
+                            self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
+                        }
+                    ]
         return []
 
 
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 041c51fb19fb..ff21bde414db 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -40,6 +40,7 @@
     ),
     "5b": LlamaConfig(max_position_embeddings=4096, num_key_value_heads=8),
     "7b": LlamaConfig(max_position_embeddings=4096),
+    # "7b": LlamaConfig(num_hidden_layers=4, max_position_embeddings=4096),
     "13b": LlamaConfig(
         hidden_size=5120,
         intermediate_size=13824,
@@ -127,9 +128,12 @@ def empty_init():
         {
             "gradient_checkpoint_config": PipelineGradientCheckpointConfig(
                 num_ckpt_layers_per_stage=[19, 19, 19, 13],
+                # num_ckpt_layers_per_stage=[48, 48, 48, 48],
             ),
             "num_layers_per_stage": [19, 20, 20, 21],
-            "pp_style": "interleaved",
+            # "num_layers_per_stage": [48, 48, 48, 48],
+            # "pp_style": "interleaved",
+            "pp_style": "1f1b",
         }
         if args.custom_ckpt
         else {}
@@ -227,12 +231,14 @@ def empty_init():
                 b_cost=1000,
                 w_cost=1000,
                 c_cost=1,
-                f_mem=mem_f,
-                b_mem=mem_b,
-                w_mem=mem_w,
+                f_mem=mem_f * 1.5,
+                b_mem=mem_b * 1.5,
+                w_mem=mem_w * 1.5,
             ).get_v_schedule()
         else:
             scheduler_nodes = None
+        # print(f"{dist.get_rank()} {scheduler_nodes[]} ")
+
         plugin = HybridParallelPlugin(
             tp_size=args.tp,
             pp_size=args.pp,
@@ -267,7 +273,7 @@ def empty_init():
             microbatch_size=args.mbs,
             initial_scale=2**8,
             precision="bf16",
-            overlap_p2p=args.overlap,
+            overlap_p2p=True,
             use_fp8=args.use_fp8,
             fp8_communication=args.use_fp8_comm,
         )
@@ -328,7 +334,7 @@ def empty_init():
     torch.set_default_dtype(torch.bfloat16)
     model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
 
-    torch.set_default_dtype(torch.float)
+    # torch.set_default_dtype(torch.float)
     coordinator.print_on_master(
         f"Booster init max CUDA memory: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB"
     )
@@ -340,7 +346,7 @@ def empty_init():
         args.profile,
         args.ignore_steps,
         1,  # avoid creating massive log files
-        save_dir=f"profile/{time.strftime('%H:%M', time.localtime())}-{args.plugin}-llama-{args.config}",
+        save_dir=f"./profile/{time.strftime('%H:%M', time.localtime())}-{args.plugin}-llama-{args.config}",
         nsys=args.nsys,
     ) as prof:
         if isinstance(plugin, HybridParallelPlugin) and args.pp > 1:
diff --git a/examples/language/performance_evaluator.py b/examples/language/performance_evaluator.py
index 65c7e49a2f03..4bebf6d037a2 100644
--- a/examples/language/performance_evaluator.py
+++ b/examples/language/performance_evaluator.py
@@ -21,11 +21,16 @@ def divide(x: float, y: float) -> float:
 def all_reduce_mean(x: float, world_size: int) -> float:
     if world_size == 1:
         return x
-
-    # Use CPU tensor to avoid OOM/weird NCCl error
-    gloo_group = dist.new_group(backend="gloo")
-    tensor = torch.tensor([x], device="cpu")
-    dist.all_reduce(tensor, group=gloo_group)
+    # BUG: RuntimeError: Invalid scalar type when use dist.all_reduce(tensor, group=gloo_group)
+    # # Use CPU tensor to avoid OOM/weird NCCl error
+    # gloo_group = dist.new_group(backend="gloo")
+    # tensor = torch.tensor([x], device="cpu")
+    # dist.all_reduce(tensor, group=gloo_group)
+    # tensor = tensor / world_size
+    # return tensor.item()
+
+    tensor = torch.tensor([x], device=torch.cuda.current_device(), dtype=torch.float)
+    dist.all_reduce(tensor)
     tensor = tensor / world_size
     return tensor.item()
 
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index ffeaf6bd8b19..71ae2f30b0a8 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -758,11 +758,11 @@ def run_with_hybridplugin(test_config):
 @parameterize(
     "config",
     [
-        (0, 1, 4, 1, 1),
-        (1, 2, 2, 1, 1),
+        # (0, 1, 4, 1, 1),
+        # (1, 2, 2, 1, 1),
         (1, 1, 2, 2, 1),
-        (1, 2, 1, 2, 1),
-        (1, 2, 1, 1, 2),
+        # (1, 2, 1, 2, 1),
+        # (1, 2, 1, 1, 2),
     ],
 )
 def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
@@ -923,10 +923,10 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
 @parameterize(
     "config",
     [
-        (0, 4, 1, 1),
+        # (0, 4, 1, 1),
         (1, 2, 2, 1),
-        (1, 2, 1, 2),
-        (1, 1, 2, 2),
+        # (1, 2, 1, 2),
+        # (1, 1, 2, 2), # TODO: no pp show gather result err
     ],
 )
 def run_with_booster_hybridplugin(config: Tuple[int, ...]):
@@ -976,7 +976,7 @@ def run_with_booster_hybridplugin(config: Tuple[int, ...]):
 
     zbv_schedule = graph.get_v_schedule()
 
-    # init MoeHybridPlugin
+    # init HybridParallelPlugin
     plugin = HybridParallelPlugin(
         pp_size=pp_size,
         num_microbatches=pp_size,

From 89a9a600bc4802c912b0ed48d48f70bbcdd8142b Mon Sep 17 00:00:00 2001
From: Tong Li <tong.li352711588@gmail.com>
Date: Thu, 24 Oct 2024 17:51:19 +0800
Subject: [PATCH 151/167] [MCTS] Add self-refined MCTS (#6098)

* add reasoner

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update code

* delete llama

* update prompts

* update readme

* update readme

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 applications/ColossalChat/README.md           |  21 +-
 .../coati/reasoner/guided_search/llm.py       |  26 ++
 .../coati/reasoner/guided_search/mcts.py      | 250 ++++++++++++++++++
 .../guided_search/prompt_store/base.py        |  10 +
 .../guided_search/prompt_store/qwen.py        |  20 ++
 5 files changed, 324 insertions(+), 3 deletions(-)
 create mode 100644 applications/ColossalChat/coati/reasoner/guided_search/llm.py
 create mode 100644 applications/ColossalChat/coati/reasoner/guided_search/mcts.py
 create mode 100644 applications/ColossalChat/coati/reasoner/guided_search/prompt_store/base.py
 create mode 100644 applications/ColossalChat/coati/reasoner/guided_search/prompt_store/qwen.py

diff --git a/applications/ColossalChat/README.md b/applications/ColossalChat/README.md
index 100cc5ece9c3..ef904b864a14 100755
--- a/applications/ColossalChat/README.md
+++ b/applications/ColossalChat/README.md
@@ -27,11 +27,11 @@
 - [Alternative Option For RLHF: SimPO](#alternative-option-for-rlhf-simple-preference-optimization-simpo)
 - [Alternative Option For RLHF: ORPO](#alternative-option-for-rlhf-odds-ratio-preference-optimization-orpo)
 - [Alternative Option For RLHF: KTO](#alternative-option-for-rlhf-kahneman-tversky-optimization-kto)
+- [O1 Journey](#o1-journey)
+  - [Inference with Self-refined MCTS](#inference-with-self-refined-mcts)
 - [FAQ](#faq)
   - [How to save/load checkpoint](#faq)
   - [How to train with limited resources](#faq)
-- [The Plan](#the-plan)
-  - [Real-time progress](#real-time-progress)
 - [Invitation to open-source contribution](#invitation-to-open-source-contribution)
 - [Quick Preview](#quick-preview)
 - [Authors](#authors)
@@ -272,7 +272,7 @@ Odds Ratio Preference Optimization (ORPO) from this [paper](https://arxiv.org/pd
 ## Alternative Option For RLHF: Kahneman-Tversky Optimization (KTO)
 We support the method introduced in the paper [KTO:Model Alignment as Prospect Theoretic Optimization](https://arxiv.org/pdf/2402.01306) (KTO). Which is a aligment method that directly maximize "human utility" of generation results. Read this [README](./examples/README.md) for more information.
 
-### Inference Quantization and Serving - After Training
+## Inference Quantization and Serving - After Training
 
 We provide an online inference server and a benchmark. We aim to run inference on single GPU, so quantization is essential when using large models.
 
@@ -281,6 +281,21 @@ We support 8-bit quantization (RTN), 4-bit quantization (GPTQ), and FP16 inferen
 Online inference server scripts can help you deploy your own services.
 For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/inference).
 
+## O1 Journey
+### Inference with Self-refined MCTS
+We provide the implementation of MCT Self-Refine (MCTSr) algorithm, an innovative integration of Large Language Models with Monte Carlo Tree Search.
+To run inference with MCTS, simply use the following script.
+```python
+from coati.reasoner.guided_search.mcts import MCTS
+from coati.reasoner.guided_search.prompt_store.qwen import Qwen32B_prompt_CFG
+
+problem = "How Many R in 'Strawberry'"
+
+search_tree = MCTS(problem=problem, max_simulations=8, cfg=Qwen32B_prompt_CFG)
+answer = search_tree.simulate()
+print(answer)
+```
+
 ## Coati7B examples
 
 ### Generation
diff --git a/applications/ColossalChat/coati/reasoner/guided_search/llm.py b/applications/ColossalChat/coati/reasoner/guided_search/llm.py
new file mode 100644
index 000000000000..5025a98ea741
--- /dev/null
+++ b/applications/ColossalChat/coati/reasoner/guided_search/llm.py
@@ -0,0 +1,26 @@
+import openai
+from openai.types.chat.chat_completion import ChatCompletion
+from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
+
+API_KEY = "Dummy API Key"
+
+
+def get_client(base_url: str | None = None) -> openai.Client:
+    return openai.Client(api_key=API_KEY, base_url=base_url)
+
+
+def chat_completion(
+    messages: list[ChatCompletionMessageParam],
+    model: str,
+    base_url: str | None = None,
+    temperature: float = 0.8,
+    **kwargs,
+) -> ChatCompletion:
+    client = get_client(base_url)
+    response = client.chat.completions.create(
+        model=model,
+        messages=messages,
+        temperature=temperature,
+        **kwargs,
+    )
+    return response
diff --git a/applications/ColossalChat/coati/reasoner/guided_search/mcts.py b/applications/ColossalChat/coati/reasoner/guided_search/mcts.py
new file mode 100644
index 000000000000..693e2b750539
--- /dev/null
+++ b/applications/ColossalChat/coati/reasoner/guided_search/mcts.py
@@ -0,0 +1,250 @@
+"""
+Implementation of MCTS + Self-refine algorithm.
+
+Reference:
+1. "Accessing GPT-4 level Mathematical Olympiad Solutions via Monte
+Carlo Tree Self-refine with LLaMa-3 8B: A Technical Report"
+2. https://github.com/BrendanGraham14/mcts-llm/
+3. https://github.com/trotsky1997/MathBlackBox/
+4. https://github.com/openreasoner/openr/blob/main/reason/guided_search/tree.py
+"""
+
+from __future__ import annotations
+
+import math
+from collections import deque
+
+import numpy as np
+import tqdm
+from coati.reasoner.guided_search.llm import chat_completion
+from coati.reasoner.guided_search.prompt_store.base import PromptCFG
+from pydantic import BaseModel
+
+
+class MCTSNode(BaseModel):
+    """
+    Node for MCTS.
+    """
+
+    answer: str
+    parent: MCTSNode = None
+    children: list[MCTSNode] = []
+    num_visits: int = 0
+    Q: int = 0
+    rewards: list[int] = []
+
+    def expand_node(self, node) -> None:
+        self.children.append(node)
+
+    def add_reward(self, reward: int) -> None:
+        self.rewards.append(reward)
+        self.Q = (np.min(self.rewards) + np.mean(self.rewards)) / 2
+
+
+class MCTS(BaseModel):
+    """
+    Simulation of MCTS process.
+    """
+
+    problem: str
+    max_simulations: int
+    cfg: PromptCFG
+    C: float = 1.4
+    max_children: int = 2
+    epsilon: float = 1e-5
+    root: MCTSNode = None
+
+    def initialization(self):
+        """
+        Root Initiation.
+        """
+        # Dummy answer as root.
+        base_answer = self.sample_base_answer()
+        self.root = MCTSNode(answer=base_answer)
+        self.self_evaluate(self.root)
+
+    def is_fully_expanded(self, node: MCTSNode):
+        return len(node.children) >= self.max_children or any(child.Q > node.Q for child in node.children)
+
+    def select_node(self) -> MCTSNode:
+        """
+        Select next node to explore.
+        """
+        candidates: list[MCTSNode] = []
+        to_explore = deque([self.root])
+
+        while to_explore:
+            current_node = to_explore.popleft()
+            if not self.is_fully_expanded(current_node):
+                candidates.append(current_node)
+            to_explore.extend(current_node.children)
+
+        if not candidates:
+            return self.root
+
+        return max(candidates, key=self.compute_uct)
+
+    def self_evaluate(self, node: MCTSNode):
+        """
+        Sample reward of the answer.
+        """
+        reward = self.sample_reward(node)
+        node.add_reward(reward)
+
+    def back_propagation(self, node: MCTSNode):
+        """
+        Back propagate the value of the refined answer.
+        """
+        parent = node.parent
+        while parent:
+            best_child_Q = max(child.Q for child in parent.children)
+            parent.Q = (parent.Q + best_child_Q) / 2
+            parent.num_visits += 1
+            parent = parent.parent
+
+    def compute_uct(self, node: MCTSNode):
+        """
+        Compute UCT.
+        """
+        if node.parent is None:
+            return -100
+        return node.Q + self.C * math.sqrt(math.log(node.parent.num_visits + 1) / (node.num_visits + self.epsilon))
+
+    def simulate(self):
+        self.initialization()
+        for _ in tqdm.tqdm(range(self.max_simulations)):
+            node = self.select_node()
+            child = self.self_refine(node)
+            node.expand_node(child)
+            self.self_evaluate(child)
+            self.back_propagation(child)
+
+        return self.get_best_answer()
+
+    def get_best_answer(self):
+        to_visit = deque([self.root])
+        best_node = self.root
+
+        while to_visit:
+            current_node = to_visit.popleft()
+            if current_node.Q > best_node.Q:
+                best_node = current_node
+            to_visit.extend(current_node.children)
+
+        return best_node.answer
+
+    def self_refine(self, node: MCTSNode):
+        """
+        Refine node.
+        """
+        critique_response = chat_completion(
+            messages=[
+                {
+                    "role": "system",
+                    "content": self.cfg.critic_system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": "\n\n".join(
+                        [
+                            f"<problem>\n{self.problem}\n</problem>",
+                            f"<current_answer>\n{node.answer}\n</current_answer>",
+                        ]
+                    ),
+                },
+            ],
+            model=self.cfg.model,
+            base_url=self.cfg.base_url,
+            max_tokens=self.cfg.max_tokens,
+        )
+        critique = critique_response.choices[0].message.content
+        assert critique is not None
+        refined_answer_response = chat_completion(
+            messages=[
+                {
+                    "role": "system",
+                    "content": self.cfg.refine_system_prompt,
+                },
+                {
+                    "role": "user",
+                    "content": "\n\n".join(
+                        [
+                            f"<problem>\n{self.problem}\n</problem>",
+                            f"<current_answer>\n{node.answer}\n</current_answer>",
+                            f"<critique>\n{critique}\n</critique>",
+                        ]
+                    ),
+                },
+            ],
+            model=self.cfg.model,
+            base_url=self.cfg.base_url,
+            max_tokens=self.cfg.max_tokens,
+        )
+        refined_answer = refined_answer_response.choices[0].message.content
+        assert refined_answer is not None
+
+        return MCTSNode(answer=refined_answer, parent=node)
+
+    def sample_base_answer(self):
+        response = chat_completion(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "The user will provide a problem. Solve the problem. The response should begin with [reasoning process]...[Verification]... and end with [Final Answer]. \nThe answer is [answer] \n#### [answer].",
+                },
+                {
+                    "role": "user",
+                    "content": f"<problem>\n {self.problem} \n</problem> \nLet's think step by step",
+                },
+            ],
+            model=self.cfg.model,
+            base_url=self.cfg.base_url,
+            max_tokens=self.cfg.max_tokens,
+        )
+        assert response.choices[0].message.content is not None
+        return response.choices[0].message.content
+
+    def sample_reward(self, node: MCTSNode):
+        """
+        Calculate reward.
+        """
+        messages = [
+            {
+                "role": "system",
+                "content": self.cfg.evaluate_system_prompt,
+            },
+            {
+                "role": "user",
+                "content": "\n\n".join(
+                    [
+                        f"<problem>\n{self.problem}\n</problem>",
+                        f"<answer>\n{node.answer}\n</answer>",
+                    ]
+                ),
+            },
+        ]
+        for attempt in range(3):
+            try:
+                response = chat_completion(
+                    messages=messages,
+                    model=self.cfg.model,
+                    base_url=self.cfg.base_url,
+                    max_tokens=self.cfg.max_tokens,
+                )
+                assert response.choices[0].message.content is not None
+                return int(response.choices[0].message.content)
+            except ValueError:
+                messages.extend(
+                    [
+                        {
+                            "role": "assistant",
+                            "content": response.choices[0].message.content,
+                        },
+                        {
+                            "role": "user",
+                            "content": "Failed to parse reward as an integer.",
+                        },
+                    ]
+                )
+                if attempt == 2:
+                    raise
diff --git a/applications/ColossalChat/coati/reasoner/guided_search/prompt_store/base.py b/applications/ColossalChat/coati/reasoner/guided_search/prompt_store/base.py
new file mode 100644
index 000000000000..b325b8fa2381
--- /dev/null
+++ b/applications/ColossalChat/coati/reasoner/guided_search/prompt_store/base.py
@@ -0,0 +1,10 @@
+from pydantic import BaseModel
+
+
+class PromptCFG(BaseModel):
+    model: str
+    base_url: str
+    max_tokens: int = 4096
+    critic_system_prompt: str
+    refine_system_prompt: str
+    evaluate_system_prompt: str
diff --git a/applications/ColossalChat/coati/reasoner/guided_search/prompt_store/qwen.py b/applications/ColossalChat/coati/reasoner/guided_search/prompt_store/qwen.py
new file mode 100644
index 000000000000..8bf0fa959da9
--- /dev/null
+++ b/applications/ColossalChat/coati/reasoner/guided_search/prompt_store/qwen.py
@@ -0,0 +1,20 @@
+"""
+Prompts for Qwen Series.
+"""
+
+from coati.reasoner.guided_search.prompt_store.base import PromptCFG
+
+Qwen32B_prompt_CFG = PromptCFG(
+    base_url="http://0.0.0.0:8008/v1",
+    model="Qwen2.5-32B-Instruct",
+    critic_system_prompt="Provide a detailed and constructive critique to improve the answer. "
+    "Highlight specific areas that need refinement or correction.",
+    refine_system_prompt="""# Instruction
+                            Refine the answer based on the critique. The response should begin with [reasoning process]...[Verification]... and end with [Final Answer].
+                         """,
+    evaluate_system_prompt=(
+        "Analyze this answer strictly and critic, provide a reward score between -100 and 100 for the answer quality, using very strict standards. "
+        "Do not give a full score above 95. Make sure the reward score is an integer. "
+        "Return *ONLY* the score."
+    ),
+)

From d0ec221b3853ccefb2f1133b5fae2dc50fed7430 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Fri, 25 Oct 2024 02:28:55 +0000
Subject: [PATCH 152/167] [fix\ fix fail case test_shard_llama

---
 colossalai/pipeline/schedule/zero_bubble_pp.py          | 2 +-
 colossalai/pipeline/stage_manager.py                    | 1 -
 colossalai/shardformer/modeling/llama.py                | 7 -------
 examples/language/llama/benchmark.py                    | 5 +++++
 tests/test_pipeline/test_schedule/test_zerobubble_pp.py | 7 ++++---
 5 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/colossalai/pipeline/schedule/zero_bubble_pp.py b/colossalai/pipeline/schedule/zero_bubble_pp.py
index 408cdffc22a2..c22dce7da1de 100644
--- a/colossalai/pipeline/schedule/zero_bubble_pp.py
+++ b/colossalai/pipeline/schedule/zero_bubble_pp.py
@@ -3,6 +3,7 @@
 
 import torch
 import torch.cuda
+import torch.distributed
 from torch.nn import Module, ModuleList
 from torch.utils._pytree import tree_flatten, tree_map
 
@@ -544,7 +545,6 @@ def backward_b_step(
             ctx = optimizer.no_sync()
         except AttributeError:
             ctx = model_chunk.no_sync()
-
         with ctx:
             optimizer.backward_by_grad(
                 tensor=output_obj_,
diff --git a/colossalai/pipeline/stage_manager.py b/colossalai/pipeline/stage_manager.py
index f30ab8e59964..8ef394ec3585 100644
--- a/colossalai/pipeline/stage_manager.py
+++ b/colossalai/pipeline/stage_manager.py
@@ -228,5 +228,4 @@ def distribute_layers(
             start_position = (num_stages * num_model_chunks) // 2 - remainder // 2
             for i in range(start_position, start_position + remainder):
                 layers_per_stage[i] += 1
-        # print(f"layers_per_stage {layers_per_stage}")
         return layers_per_stage
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index a02db1168b8c..7a04c5451cfc 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -32,7 +32,6 @@
 from ..layer import ColoAttention, RingAttention, dist_cross_entropy
 
 _SUPPORTED_SP_MODE = ["all_to_all", "split_gather", "ring", "ring_attn"]
-_GLOBAL_ORDER_ = 0
 
 
 class LlamaPipelineForwards:
@@ -194,10 +193,6 @@ def llama_model_forward(
             assert num_ckpt_layers <= end_idx - start_idx
 
         for idx, decoder_layer in enumerate(self.layers[start_idx:end_idx], start=start_idx):
-            # global _GLOBAL_ORDER_
-            # if torch.distributed.get_rank() == 0:
-            #     print(f"rank {torch.distributed.get_rank()} {stage_manager.stage}; start:{start_idx}, end:{end_idx} hidden_states require grad{hidden_states.requires_grad}")
-            # # _GLOBAL_ORDER_ += 1
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
             if idx - start_idx < num_ckpt_layers:
@@ -221,8 +216,6 @@ def llama_model_forward(
                     use_cache=use_cache,
                     cache_position=cache_position,
                 )
-                # if torch.distributed.get_rank() == 0:
-                #     print(f"rank {torch.distributed.get_rank()} {stage_manager.stage}; start:{start_idx}, end:{end_idx} layer_outputs require grad {layer_outputs[0].requires_grad}")
             hidden_states = layer_outputs[0]
 
             if use_cache:
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index ff21bde414db..0d80bc2254c9 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -287,6 +287,11 @@ def empty_init():
     # ==============================
     dp_size = getattr(plugin, "dp_size", coordinator.world_size)
 
+    if args.config in MODEL_CONFIGS:
+        config = MODEL_CONFIGS[args.config]
+    else:
+        config = AutoConfig.from_pretrained(args.config, trust_remote_code=True)
+
     torch.cuda.manual_seed(42)
     dataset = RandomDataset(
         num_samples=args.batch_size * args.num_steps * dp_size, max_length=args.max_length, vocab_size=config.vocab_size
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index 71ae2f30b0a8..5f286d173cd2 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -923,10 +923,11 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
 @parameterize(
     "config",
     [
-        # (0, 4, 1, 1),
-        (1, 2, 2, 1),
+        # (1, 2, 2, 1), # Pass
+        # TODO: only support pp + tp accleration; Will support fully pp and None tp Hybrid in furture;
+        (0, 4, 1, 1),
         # (1, 2, 1, 2),
-        # (1, 1, 2, 2), # TODO: no pp show gather result err
+        # (1, 1, 2, 2),
     ],
 )
 def run_with_booster_hybridplugin(config: Tuple[int, ...]):

From cc0dfddcbc8ec09033583b870c35901aabf44a4e Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Fri, 25 Oct 2024 09:01:13 +0000
Subject: [PATCH 153/167] [fix] fix test_shard_llama

---
 colossalai/shardformer/policies/llama.py      | 38 +++++++++++--------
 examples/language/llama/benchmark.py          |  1 -
 .../test_schedule/test_zerobubble_pp.py       |  4 +-
 3 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 8a980bf9d621..28ac2dc7f843 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -394,8 +394,8 @@ def get_held_layers(self) -> List[Module]:
         return held_layers
 
     def get_shared_params(self) -> List[Dict[int, Tensor]]:
-        # if self.pipeline_stage_manager is not None and self.pipeline_stage_manager.use_zbv:
-        #     return []
+        if self.pipeline_stage_manager is not None and self.pipeline_stage_manager.use_zbv:
+            return []
         llama_model = self.model.model
         if self.pipeline_stage_manager and self.pipeline_stage_manager.num_stages > 1:
             if (
@@ -403,20 +403,26 @@ def get_shared_params(self) -> List[Dict[int, Tensor]]:
                 and self.pipeline_stage_manager.num_stages > 1
             ):
                 # tie weights
-                if self.pipeline_stage_manager.use_zbv:
-                    return [
-                        {
-                            0: llama_model.embed_tokens.weight,
-                            0: self.model.lm_head.weight,
-                        }
-                    ]
-                else:
-                    return [
-                        {
-                            0: llama_model.embed_tokens.weight,
-                            self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
-                        }
-                    ]
+                return [
+                    {
+                        0: llama_model.embed_tokens.weight,
+                        self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
+                    }
+                ]
+                # if self.pipeline_stage_manager.use_zbv:
+                #     return [
+                #         {
+                #             0: llama_model.embed_tokens.weight,
+                #             0: self.model.lm_head.weight,
+                #         }
+                #     ]
+                # else:
+                #     return [
+                #         {
+                #             0: llama_model.embed_tokens.weight,
+                #             self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
+                #         }
+                #     ]
         return []
 
 
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 0d80bc2254c9..b60bdd03e705 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -237,7 +237,6 @@ def empty_init():
             ).get_v_schedule()
         else:
             scheduler_nodes = None
-        # print(f"{dist.get_rank()} {scheduler_nodes[]} ")
 
         plugin = HybridParallelPlugin(
             tp_size=args.tp,
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index 5f286d173cd2..c485d3f5430c 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -923,9 +923,9 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
 @parameterize(
     "config",
     [
-        # (1, 2, 2, 1), # Pass
+        (1, 2, 2, 1),  # Pass
         # TODO: only support pp + tp accleration; Will support fully pp and None tp Hybrid in furture;
-        (0, 4, 1, 1),
+        # (0, 4, 1, 1),
         # (1, 2, 1, 2),
         # (1, 1, 2, 2),
     ],

From 03fa79a55c327d411ed1f7af7c3fc88007708d60 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Fri, 25 Oct 2024 10:17:06 +0000
Subject: [PATCH 154/167] [fix] fix llama modeling policy;

---
 colossalai/shardformer/policies/llama.py              | 3 ++-
 tests/test_shardformer/test_model/test_shard_llama.py | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 28ac2dc7f843..bef39a6ca4a7 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -96,7 +96,8 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 target_key=attn_cls,
             )
 
-        if self.pipeline_stage_manager is not None:
+        # if self.pipeline_stage_manager is not None:
+        if self.pipeline_stage_manager is None:
             self.append_or_create_method_replacement(
                 description={
                     "forward": partial(
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index 33707a4f6921..b43e45bcf393 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -325,6 +325,7 @@ def run_llama_test(test_config):
         ).get_v_schedule()
         test_config["scheduler_nodes"] = scheduler_nodes
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
+        print(f"name {name}")
         if test_config.get("sequence_parallelism_mode", None) == "ring_attn" and "causal" not in name:
             continue
         try:

From 6377aa0fffb8fbd6862fc2b4ed536724cbe09d64 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Mon, 28 Oct 2024 02:42:33 +0000
Subject: [PATCH 155/167] [fix] fix test_shard_llama ci;

---
 colossalai/shardformer/modeling/llama.py              | 2 +-
 tests/test_shardformer/test_model/test_shard_llama.py | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 7a04c5451cfc..47c17e7494f2 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -82,7 +82,7 @@ def llama_model_forward(
             elif input_ids is not None:
                 batch_size, seq_length = input_ids.shape[:2]
             elif inputs_embeds is not None:
-                batch_size, seq_length = inputs_embeds.shape[:2]
+                batch_size, seq_length, _ = inputs_embeds.shape[:2]
             else:
                 raise ValueError("You have to specify either input_ids or inputs_embeds")
             if inputs_embeds is None:
diff --git a/tests/test_shardformer/test_model/test_shard_llama.py b/tests/test_shardformer/test_model/test_shard_llama.py
index b43e45bcf393..33707a4f6921 100644
--- a/tests/test_shardformer/test_model/test_shard_llama.py
+++ b/tests/test_shardformer/test_model/test_shard_llama.py
@@ -325,7 +325,6 @@ def run_llama_test(test_config):
         ).get_v_schedule()
         test_config["scheduler_nodes"] = scheduler_nodes
     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
-        print(f"name {name}")
         if test_config.get("sequence_parallelism_mode", None) == "ring_attn" and "causal" not in name:
             continue
         try:

From 5aee4261a60586b7cf5eda3992f247ff5569aedc Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Mon, 28 Oct 2024 06:06:07 +0000
Subject: [PATCH 156/167] [fix] fix test zerobubble

---
 colossalai/shardformer/modeling/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 47c17e7494f2..7a04c5451cfc 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -82,7 +82,7 @@ def llama_model_forward(
             elif input_ids is not None:
                 batch_size, seq_length = input_ids.shape[:2]
             elif inputs_embeds is not None:
-                batch_size, seq_length, _ = inputs_embeds.shape[:2]
+                batch_size, seq_length = inputs_embeds.shape[:2]
             else:
                 raise ValueError("You have to specify either input_ids or inputs_embeds")
             if inputs_embeds is None:

From fafe049b83bad3a6aa6e3a31c68b38ac63167b53 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Tue, 29 Oct 2024 03:24:15 +0000
Subject: [PATCH 157/167] [fix] fix handle name; rm useless comments;

---
 .../pipeline/schedule/zero_bubble_pp.py       |  7 ++-
 colossalai/pipeline/weight_grad_store.py      | 51 -------------------
 colossalai/shardformer/policies/llama.py      | 20 +-------
 .../test_schedule/test_zerobubble_pp.py       |  4 --
 4 files changed, 4 insertions(+), 78 deletions(-)

diff --git a/colossalai/pipeline/schedule/zero_bubble_pp.py b/colossalai/pipeline/schedule/zero_bubble_pp.py
index c22dce7da1de..638b601d4414 100644
--- a/colossalai/pipeline/schedule/zero_bubble_pp.py
+++ b/colossalai/pipeline/schedule/zero_bubble_pp.py
@@ -107,7 +107,7 @@ def _free_buffers(self):
         self.local_send_backward_buffer = []
 
         # wait pp buffer
-        self.send_handles = []
+        self.wait_handles = []
 
     def assert_buffer_empty(self):
         # assert buffer is empty at end
@@ -129,7 +129,6 @@ def assert_buffer_empty(self):
         assert len(self.recv_backward_buffer[1]) == 0
         assert len(self.local_send_forward_buffer) == 0
         assert len(self.local_send_backward_buffer) == 0
-        # assert len(self.send_handles) == 0
 
     def load_batch(self, data_iter: Iterable, device: Optional[torch.device] = None) -> None:
         """Load a batch from data iterator.
@@ -891,7 +890,7 @@ def run_forward_backward(
                 # communication
                 communication_func = self.communication_map[scheduled_node.type]
                 wait_handle = communication_func(scheduled_node.chunk)
-                self.send_handles.append(wait_handle)
+                self.wait_handles.append(wait_handle)
             elif scheduled_node.type == "F":
                 self.schedule_f(
                     scheduled_node=scheduled_node,
@@ -915,7 +914,7 @@ def run_forward_backward(
                     model_chunk_id=scheduled_node.chunk,
                     optimizer=optimizer,
                 )
-        for h in self.send_handles:
+        for h in self.wait_handles:
             for hh in h:
                 hh.wait()
 
diff --git a/colossalai/pipeline/weight_grad_store.py b/colossalai/pipeline/weight_grad_store.py
index dff4fdd0200e..c51c45085ea2 100644
--- a/colossalai/pipeline/weight_grad_store.py
+++ b/colossalai/pipeline/weight_grad_store.py
@@ -1,7 +1,5 @@
 import queue
 
-from colossalai.pipeline.stage_manager import PipelineStageManager
-
 
 class WeightGradStore:
 
@@ -32,52 +30,3 @@ def pop(cls, chunk=0):
                     weight.grad = grad_weight
         else:
             raise Exception("Pop empty queue.")
-
-    @classmethod
-    def clear(cls, stage_manager: PipelineStageManager, chunk=0):
-        pass
-        # print(f"stage {stage_manager.stage} len_chunk_0 {cls.weight_grad_queue[0].qsize()} len_chunk_1 {cls.weight_grad_queue[1].qsize()}")
-        # while cls.weight_grad_queue[chunk].qsize() > 0:
-        #     stored_grads = cls.weight_grad_queue[chunk].get()
-        #     for total_input, grad_output, weight, func in stored_grads:
-        #         if weight.grad is not None:
-        #             func(total_input, grad_output, weight.grad)
-        #         # for first bwd; weight.grad is None, assign grad_weight to weight.grad
-        #         else:
-        #             grad_weight = func(total_input, grad_output)
-        #             weight.grad = grad_weight
-
-        # weight_grad_tasks = []
-        # while cls.weight_grad_queue[chunk].qsize() > 0:
-        #     stored_grads = cls.weight_grad_queue[chunk].get()
-        #     if len(weight_grad_tasks) == 0:
-        #         for _ in stored_grads:
-        #             weight_grad_tasks.append([])
-        #     else:
-        #         assert len(weight_grad_tasks) == len(stored_grads)
-        #     for i, task in enumerate(stored_grads):
-        #         weight_grad_tasks[i].append(task)
-
-        # if stage_manager.is_last_stage(ignore_chunk=True) and chunk == 1:
-        #     assert len(weight_grad_tasks) > 0
-        #     output_layer_grads = weight_grad_tasks[0]
-        #     for j in range(len(output_layer_grads)):
-        #         total_input, grad_output, weight, func = output_layer_grads[j]
-        #         if output_layer_weight is None:
-        #             output_layer_weight = weight
-        #         assert output_layer_weight is weight
-        #         func(total_input, grad_output, weight.grad)
-        #         output_layer_grads[j] = None  # release memory
-        #     weight_grad_tasks = weight_grad_tasks[1:]
-
-        # for i in range(len(weight_grad_tasks)):
-        #     tasks = weight_grad_tasks[i]
-        #     param = None
-        #     for j in range(len(tasks)):
-        #         total_input, grad_output, weight, func = tasks[j]
-        #         if param is None:
-        #             param = weight
-        #         assert param is weight
-        #         func(total_input, grad_output, weight.grad)
-        #         tasks[j] = None  # release memory
-        #     weight_grad_tasks[i] = None  # release memory
diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index bef39a6ca4a7..756d32454233 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -60,10 +60,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         else:
             norm_cls = RMSNorm
 
-        if self.pipeline_stage_manager:
-            use_zbv = self.pipeline_stage_manager.use_zbv
-        else:
-            use_zbv = False
+        use_zbv = self.pipeline_stage_manager is not None and self.pipeline_stage_manager.use_zbv
 
         sp_mode = self.shard_config.sequence_parallelism_mode or None
         sp_size = self.shard_config.sequence_parallel_size or None
@@ -96,7 +93,6 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
                 target_key=attn_cls,
             )
 
-        # if self.pipeline_stage_manager is not None:
         if self.pipeline_stage_manager is None:
             self.append_or_create_method_replacement(
                 description={
@@ -410,20 +406,6 @@ def get_shared_params(self) -> List[Dict[int, Tensor]]:
                         self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
                     }
                 ]
-                # if self.pipeline_stage_manager.use_zbv:
-                #     return [
-                #         {
-                #             0: llama_model.embed_tokens.weight,
-                #             0: self.model.lm_head.weight,
-                #         }
-                #     ]
-                # else:
-                #     return [
-                #         {
-                #             0: llama_model.embed_tokens.weight,
-                #             self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
-                #         }
-                #     ]
         return []
 
 
diff --git a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
index c485d3f5430c..71ff110598a4 100644
--- a/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
+++ b/tests/test_pipeline/test_schedule/test_zerobubble_pp.py
@@ -904,7 +904,6 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
             torch_output = torch_model(inputs_embeds=input_data_.to(dtype)).last_hidden_state.mean()
             torch_output.backward()
             torch_output_sum += torch_output.detach()
-        # print(f"parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
         # avg dp grads follows zero optimizer
         for p in torch_model.parameters():
             if p.grad is not None:
@@ -912,7 +911,6 @@ def run_with_booster_moehybridplugin(config: Tuple[int, ...]):
         torch_optimizer.step()
         torch_optimizer.zero_grad()
 
-        # print(f"rank {dist.get_rank()} parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
         assert_loose_close(parallel_output, torch_output_sum, dtype=dtype)
         print(f"rank {dist.get_rank()} config {test_config}  test passed")
     clear_layout_converter()
@@ -1064,7 +1062,6 @@ def run_with_booster_hybridplugin(config: Tuple[int, ...]):
             torch_output = torch_model(inputs_embeds=input_data_.to(dtype)).last_hidden_state.mean()
             torch_output.backward()
             torch_output_sum += torch_output.detach()
-        # print(f"parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
         # avg dp grads follows zero optimizer
         for p in torch_model.parameters():
             if p.grad is not None:
@@ -1072,7 +1069,6 @@ def run_with_booster_hybridplugin(config: Tuple[int, ...]):
         torch_optimizer.step()
         torch_optimizer.zero_grad()
 
-        # print(f"rank {dist.get_rank()} parallel_output {parallel_output} torch_output_sum {torch_output_sum}")
         assert_loose_close(parallel_output, torch_output_sum, dtype=dtype)
         print(f"rank {dist.get_rank()} pp_size:{pp_size}, tp_size {tp_size}, sp_size :{sp_size} test passed")
     clear_layout_converter()

From fa3ccda8ee6da5fb5751ff93b5226d757e4a5e79 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Tue, 29 Oct 2024 03:33:58 +0000
Subject: [PATCH 158/167] [fix] fix send recv signature;

---
 colossalai/pipeline/schedule/zero_bubble_pp.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/colossalai/pipeline/schedule/zero_bubble_pp.py b/colossalai/pipeline/schedule/zero_bubble_pp.py
index 638b601d4414..e310e9bf3254 100644
--- a/colossalai/pipeline/schedule/zero_bubble_pp.py
+++ b/colossalai/pipeline/schedule/zero_bubble_pp.py
@@ -1,5 +1,5 @@
 from functools import partial
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Union
 
 import torch
 import torch.cuda
@@ -206,7 +206,7 @@ def get_model_chunk_id(self, microbatch_id: int, is_forward: bool) -> int:
             model_chunk_id = self.num_model_chunks - model_chunk_id - 1
         return model_chunk_id
 
-    def recv_forward(self, model_chunk_id: int, prev_rank: int = None) -> Tuple[Any, List]:
+    def recv_forward(self, model_chunk_id: int, prev_rank: int = None) -> List:
         """Copy the forward output from the previous stage in pipeline as the input tensor of this stage.
            For ZBV.
 
@@ -267,7 +267,7 @@ def recv_forward(self, model_chunk_id: int, prev_rank: int = None) -> Tuple[Any,
                     # return input_tensor, wait_handles
                     return wait_handles
 
-    def recv_backward(self, model_chunk_id: int, next_rank: int = None) -> Tuple[Any, List]:
+    def recv_backward(self, model_chunk_id: int, next_rank: int = None) -> List:
         """Copy the gradient tensor from the next stage in pipeline as the input gradient of this stage.
            For ZBV.
 

From 982e4ee1f85a3aec285197d8bbdc3ca292bd15ff Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Tue, 29 Oct 2024 07:35:50 +0000
Subject: [PATCH 159/167] [fix] fix comment in llama & benchmark

---
 colossalai/shardformer/policies/llama.py | 5 +----
 examples/language/llama/benchmark.py     | 6 +++---
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/colossalai/shardformer/policies/llama.py b/colossalai/shardformer/policies/llama.py
index 756d32454233..2b3a30bad3f5 100644
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -414,10 +414,7 @@ def module_policy(self):
         from transformers import LlamaForSequenceClassification
 
         policy = super().module_policy()
-        if self.pipeline_stage_manager:
-            use_zbv = self.pipeline_stage_manager.use_zbv
-        else:
-            use_zbv = False
+        use_zbv = self.pipeline_stage_manager is not None and self.pipeline_stage_manager.use_zbv
 
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for sequence classification
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index b60bdd03e705..68ceb9ac1984 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -108,6 +108,7 @@ def main():
     parser.add_argument("--no_cache", action="store_true")
     parser.add_argument("--use_fp8_comm", action="store_true", default=False, help="for using fp8 during communication")
     parser.add_argument("--use_fp8", action="store_true", default=False, help="for using fp8 linear")
+    parser.add_argument("--overlap_p2p", action="store_true", default=True, help="for using overlap p2p")
     parser.add_argument("--overlap_allgather", action="store_true")
     parser.add_argument(
         "--sp_mode",
@@ -256,7 +257,6 @@ def empty_init():
             use_fp8=args.use_fp8,
             fp8_communication=args.use_fp8_comm,
             scheduler_nodes=scheduler_nodes,
-            make_vocab_size_divisible_by=1,
             **hybrid_kwargs,
         )
     elif args.plugin == "3d_cpu":
@@ -272,7 +272,7 @@ def empty_init():
             microbatch_size=args.mbs,
             initial_scale=2**8,
             precision="bf16",
-            overlap_p2p=True,
+            overlap_p2p=args.overlap_p2p,
             use_fp8=args.use_fp8,
             fp8_communication=args.use_fp8_comm,
         )
@@ -338,7 +338,7 @@ def empty_init():
     torch.set_default_dtype(torch.bfloat16)
     model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
 
-    # torch.set_default_dtype(torch.float)
+    torch.set_default_dtype(torch.float)
     coordinator.print_on_master(
         f"Booster init max CUDA memory: {get_accelerator().max_memory_allocated()/1024**2:.2f} MB"
     )

From d2e05a99b3a776eb0f438d61b74065c9633c7391 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Wed, 30 Oct 2024 02:54:32 +0000
Subject: [PATCH 160/167] [feat] support no tensor parallel Linear in
 shardformer; Add test for use weightGradStore and not use WeightGradStore

---
 colossalai/shardformer/layer/__init__.py      |   3 +-
 colossalai/shardformer/layer/_operation.py    | 106 +++++++++++-
 colossalai/shardformer/layer/linear.py        | 155 +++++++++++++++++-
 .../test_layer/test_linear_1d.py              |  92 ++++++++++-
 4 files changed, 352 insertions(+), 4 deletions(-)

diff --git a/colossalai/shardformer/layer/__init__.py b/colossalai/shardformer/layer/__init__.py
index 8882a33c15e6..613ce73c3cf2 100644
--- a/colossalai/shardformer/layer/__init__.py
+++ b/colossalai/shardformer/layer/__init__.py
@@ -2,7 +2,7 @@
 from .attn import AttnMaskType, ColoAttention, RingAttention, get_pad_info
 from .dropout import DropoutForParallelInput, DropoutForReplicatedInput
 from .embedding import Embedding1D, PaddingEmbedding, VocabParallelEmbedding1D
-from .linear import Linear1D_Col, Linear1D_Row, PaddingLMHead, VocabParallelLMHead1D
+from .linear import Linear1D, Linear1D_Col, Linear1D_Row, PaddingLMHead, VocabParallelLMHead1D
 from .loss import cross_entropy_1d, dist_cross_entropy
 from .normalization import FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm
 from .parallel_module import ParallelModule
@@ -11,6 +11,7 @@
 __all__ = [
     "Embedding1D",
     "VocabParallelEmbedding1D",
+    "Linear1D",
     "Linear1D_Col",
     "Linear1D_Row",
     "GPT2FusedLinearConv1D_Col",
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index 4a0800468ed7..46f50ef0279f 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -154,7 +154,6 @@ def execute_w_pass_grad_accum(_input_, _grad_output_, _weight_main_grad_, wgrad_
             wgrad_gemm_accum_func(_input_, _grad_output_, _weight_main_grad_)
 
         def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_func=None):
-            # _grad_output_.t().matmul(_input_)
             return wgrad_gemm_func(_grad_output_.t(), _input_)
 
         # In order to be hooked into Gemini's '__torch_function__', adding a view operation to bias.
@@ -236,6 +235,107 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
         return grad_input, grad_weight, grad_bias, None, None, None, None
 
 
+class LinearBase(torch.autograd.Function):
+    """
+    Linear layer baseline (no tensor parallel version).
+    """
+
+    @staticmethod
+    def forward(ctx, input_, weight, bias, async_grad_allreduce, fp8_communication=False, use_zbv=False):
+        ctx.save_for_backward(input_, weight, bias)
+        ctx.use_bias = bias is not None
+        ctx.async_grad_allreduce = async_grad_allreduce
+        ctx.fp8_communication = fp8_communication
+        ctx.use_zbv = use_zbv
+        if bias is not None:
+            output = F.linear(input_, weight, bias)
+        else:
+            output = F.linear(input_, weight)
+
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        input, weight, bias = ctx.saved_tensors
+        use_bias = ctx.use_bias
+        ctx.fp8_communication
+        use_zbv = ctx.use_zbv
+
+        def execute_w_pass_grad_accum(_input_, _grad_output_, _weight_main_grad_, wgrad_gemm_accum_func=None):
+            wgrad_gemm_accum_func(_input_, _grad_output_, _weight_main_grad_)
+
+        def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_func=None):
+            return wgrad_gemm_func(_grad_output_.t(), _input_)
+
+        # In order to be hooked into Gemini's '__torch_function__', adding a view operation to bias.
+        if use_bias:
+            bias.view(bias.shape)
+
+        total_input = input.contiguous()
+        grad_input = grad_output.matmul(weight)
+        grad_output = grad_output.contiguous()
+        # Convert the tensor shapes to 2D for execution compatibility
+        if len(grad_output.shape) > 2:
+            grad_output = grad_output.view(-1, grad_output.shape[-1])
+            total_input = total_input.view(-1, total_input.shape[-1])
+
+        if _grad_accum_fusion_available and weight.grad is not None:
+            grad = weight.grad
+            if use_zbv:
+                # TODO: append input, grad_output_, weight, grad func to WeightGradStore
+                if grad.dtype == torch.float32:
+                    WeightGradStore.put(
+                        total_input,
+                        grad_output,
+                        weight,
+                        functools.partial(
+                            execute_w_pass_grad_accum,
+                            wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32,
+                        ),
+                    )
+                    grad_weight = None
+                elif grad.dtype in (torch.float16, torch.bfloat16):
+                    WeightGradStore.put(
+                        total_input,
+                        grad_output,
+                        weight,
+                        functools.partial(
+                            execute_w_pass_grad_accum,
+                            wgrad_gemm_accum_func=fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16,
+                        ),
+                    )
+                    grad_weight = None
+                else:
+                    raise RuntimeError("Unsupported gradient type for gradient accumulation fusion")
+            else:
+                if grad.dtype == torch.float32:
+                    fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp32(total_input, grad_output, grad)
+                    grad_weight = None
+                elif grad.dtype == torch.float16:
+                    fused_weight_gradient_mlp_cuda.wgrad_gemm_accum_fp16(total_input, grad_output, grad)
+                    grad_weight = None
+                else:
+                    grad_weight = grad_output.t().matmul(total_input)
+        else:
+            if use_zbv:
+                WeightGradStore.put(
+                    total_input,
+                    grad_output,
+                    weight,
+                    functools.partial(
+                        execute_w_pass,
+                        wgrad_gemm_func=torch.matmul,
+                    ),
+                )
+                grad_weight = None
+            else:
+                grad_weight = grad_output.t().matmul(total_input)
+
+        grad_bias = grad_output.sum(dim=0) if use_bias else None
+
+        return grad_input, grad_weight, grad_bias, None, None, None, None
+
+
 def _ring_as_gather(func, input_to_gather=None, input_local=None, process_group=None, gather_dim=1, keep_item=False):
     # currently only support one single tensor as output
     group_size = dist.get_world_size(process_group)
@@ -1101,6 +1201,10 @@ def linear_with_async_comm(
     )
 
 
+def linear_base(input_, weight, bias, async_grad_allreduce, fp8_communication=False, use_zbv=False):
+    return LinearBase.apply(input_, weight, bias, async_grad_allreduce, fp8_communication, use_zbv)
+
+
 def linear_gather_forward_reducescatter_backward(
     input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring=False
 ):
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index a8a3be63a1a9..cb1496a0b4b0 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -25,6 +25,7 @@
 from ._operation import (
     gather_forward_reducescatter_backward,
     gather_forward_split_backward,
+    linear_base,
     linear_gather_forward_reducescatter_backward,
     linear_reducescatter_forward_gather_backward,
     linear_with_async_comm,
@@ -35,7 +36,159 @@
 from .parallel_module import PaddingParallelModule, ParallelModule
 from .utils import create_randomizer_with_offset
 
-__all__ = ["Linear1D_Col", "Linear1D_Row"]
+__all__ = ["Linear1D", "Linear1D_Col", "Linear1D_Row"]
+
+
+class Linear1D(ParallelModule):
+    r"""Linear layer with no parallelism.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (`torch.dtype`): The dtype of parameters, defaults to None.
+        device (`torch.device`): The device of parameters, defaults to None.
+        gather_output (bool, optional): If true, call all-gather on output and make Y available
+                    to all GPUs, otherwise, every GPU will have its output
+                    which is :math:`Y_i = XA_i`, defaults to False
+        seq_parallel (`bool`): If set to ``True``, it will use sequence parallel, defaults to False.
+        overlap (`bool`): If set to ``True``, it will overlap input all-gather with gradient computation during backward, defaults to False.
+        skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False
+        weight_initializer (`typing.Callable`):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (`typing.Callable`):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        dtype: torch.dtype = None,
+        device: torch.device = None,
+        gather_output: bool = False,
+        seq_parallel_mode: str = None,
+        seq_parallel_dim: int = 1,
+        overlap: torch.cuda.Stream = None,
+        skip_bias_add: bool = False,
+        weight: Optional[Parameter] = None,
+        bias_: Optional[Parameter] = None,
+        weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
+        bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
+        fp8_communication: bool = False,
+        use_zbv: bool = False,
+        **kwargs,
+    ):
+        super().__init__(weight=weight, bias_=bias_, **kwargs)
+
+        # Keep input parameters
+        self.in_features = in_features
+        self.out_features = out_features
+        self.gather_output = gather_output
+        self.seq_parallel_mode = seq_parallel_mode
+        self.seq_parallel_dim = seq_parallel_dim
+        self.overlap = overlap
+        self.skip_bias_add = skip_bias_add
+        self.device = device
+        self.fp8_communication = fp8_communication
+        self.use_zbv = use_zbv
+
+        if skip_bias_add and not bias:
+            raise ValueError("cannot skip bias addition if bias is None")
+
+        # offset the seed with randomizer index and rank
+        seed = torch.random.initial_seed()
+
+        self.randomizer = create_randomizer_with_offset(seed, process_group=None)
+
+        # sanity check
+        if weight is not None:
+            assert not bias or bias_ is not None, "bias_ must be provided if bias is True when weight is not None"
+        else:
+            assert bias_ is None, "bias_ must be None if weight is None"
+
+        # Parameters.
+        if weight is None:
+            factory_kwargs = {"device": device, "dtype": dtype}
+            self.weight = Parameter(torch.empty(self.out_features, self.in_features, **factory_kwargs))
+        else:
+            weight.data = weight.data.to(device=device, dtype=dtype)
+            self.weight = weight
+
+        if bias:
+            if bias_ is None:
+                self.bias = Parameter(torch.empty(self.out_features, **factory_kwargs))
+            else:
+                bias_.data = bias_.data.to(device=device, dtype=dtype)
+                self.bias = bias_
+        else:
+            self.bias = None
+
+        if weight is None:
+            # init weights
+            self.reset_parameters(weight_initializer, bias_initializer)
+
+    @staticmethod
+    def from_native_module(module: nn.Linear, **kwargs) -> ParallelModule:
+        r"""
+        Convert a native PyTorch linear layer to a parallelized linear layer.
+        """
+        LazyInitContext.materialize(module)
+        # get the attributes
+        in_features = module.in_features
+        out_features = module.out_features
+        bias = module.bias is not None
+        device = module.weight.device
+
+        linear_1d = Linear1D(
+            in_features=in_features,
+            out_features=out_features,
+            bias=bias,
+            device=device,
+            weight=module.weight,
+            bias_=module.bias,
+            **kwargs,
+        )
+
+        return linear_1d
+
+    def reset_parameters(self, weight_initializer, bias_initializer) -> None:
+        with self.randomizer.fork_rng(enable_cpu=True):
+            fan_in, fan_out = self.in_features, self.out_features
+            weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
+            if self.bias is not None:
+                bias_initializer(self.bias, fan_in=fan_in)
+
+    def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
+        assert (
+            input_.shape[-1] == self.weight.shape[-1]
+        ), "Invalid shapes in Linear1D_Col forward: input={}, weight={}. Expected last dim of input {}.".format(
+            input_.shape, self.weight.shape, self.weight.shape[-1]
+        )
+
+        # Set up backprop all-reduce.
+        input_parallel = input_
+
+        # Matrix multiply.
+        bias = self.bias if not self.skip_bias_add else None
+        output_parallel = linear_base(
+            input_parallel,
+            self.weight,
+            bias,
+            False,
+            fp8_communication=self.fp8_communication,
+            use_zbv=self.use_zbv,
+        )
+
+        output = output_parallel
+
+        if self.skip_bias_add:
+            return output, self.bias
+        else:
+            return output
 
 
 class Linear1D_Col(ParallelModule):
diff --git a/tests/test_shardformer/test_layer/test_linear_1d.py b/tests/test_shardformer/test_layer/test_linear_1d.py
index 541aa3251400..0556bc986c66 100644
--- a/tests/test_shardformer/test_layer/test_linear_1d.py
+++ b/tests/test_shardformer/test_layer/test_linear_1d.py
@@ -8,7 +8,8 @@
 
 import colossalai
 from colossalai.lazy import LazyInitContext
-from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row
+from colossalai.pipeline.weight_grad_store import WeightGradStore
+from colossalai.shardformer.layer import Linear1D, Linear1D_Col, Linear1D_Row
 from colossalai.tensor.d_tensor import is_distributed_tensor
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 
@@ -117,6 +118,93 @@ def check_linear_1d_row(lazy_init: bool, seq_parallel_mode: bool):
     assert_close(x_for_unshard.grad, x_for_shard.grad)
 
 
+def check_linear_without_weight_grad_store(lazy_init: bool, seq_parallel_mode: bool):
+    ctx = LazyInitContext() if lazy_init else nullcontext()
+
+    linear = nn.Linear(32, 128).cuda()
+    with ctx:
+        linear_copy = nn.Linear(32, 128).cuda()
+    linear_base = Linear1D.from_native_module(
+        linear_copy, parallel_input=False, seq_parallel_mode=seq_parallel_mode, use_zbv=False
+    )
+    assert linear_base.weight.shape == torch.Size([128, 32])
+    assert linear_base.bias.shape == torch.Size([128])
+    assert linear_copy.weight is linear_base.weight
+    assert linear_copy.bias is linear_base.bias
+
+    linear.load_state_dict(linear_base.state_dict())
+    linear_base.load_state_dict(linear.state_dict())
+
+    # check computation correctness
+    # [batch_size, seq_len, hidden_size]
+    x = torch.rand(2, 4, 32).cuda()
+    x_for_unshard = x.expand_as(x.clone())
+    x_for_unshard.requires_grad_(True)
+    x_for_shard = x.expand_as(x.clone())
+    x_for_shard.requires_grad_(True)
+
+    # run forward
+    out = linear(x_for_unshard)
+    gather_out = linear_base(x_for_shard)
+    assert_close(out, gather_out)
+
+    # check backward correctness
+    out.sum().backward()
+    gather_out.sum().backward()
+    assert_close(linear.weight.grad, linear_base.weight.grad)
+    # check the input gradients
+    assert x_for_shard.grad is not None
+    assert x_for_unshard.grad is not None
+    assert_close(x_for_unshard.grad, x_for_shard.grad)
+
+
+def check_linear_with_weight_grad_store(lazy_init: bool, seq_parallel_mode: bool):
+    ctx = LazyInitContext() if lazy_init else nullcontext()
+
+    linear = nn.Linear(32, 128).cuda()
+    with ctx:
+        linear_copy = nn.Linear(32, 128).cuda()
+    linear_base = Linear1D.from_native_module(
+        linear_copy, parallel_input=False, seq_parallel_mode=seq_parallel_mode, use_zbv=True
+    )
+    assert linear_base.weight.shape == torch.Size([128, 32])
+    assert linear_base.bias.shape == torch.Size([128])
+    assert linear_copy.weight is linear_base.weight
+    assert linear_copy.bias is linear_base.bias
+
+    linear.load_state_dict(linear_base.state_dict())
+    linear_base.load_state_dict(linear.state_dict())
+
+    # check computation correctness
+    # [batch_size, seq_len, hidden_size]
+    x = torch.rand(2, 4, 32).cuda()
+    x_for_unshard = x.expand_as(x.clone())
+    x_for_unshard.requires_grad_(True)
+    x_for_shard = x.expand_as(x.clone())
+    x_for_shard.requires_grad_(True)
+
+    # run forward
+    out = linear(x_for_unshard)
+    gather_out = linear_base(x_for_shard)
+    assert_close(out, gather_out)
+
+    # check backward correctness
+    out.sum().backward()
+    gather_out.sum().backward()
+
+    # Weight grad is None before we do WeightGradStore pop
+    assert linear_base.weight.grad is None
+    # after WeightGradStore pop (dw computation complete), we assert weight grad
+    WeightGradStore.flush(chunk=0)  # flush buffer to chunk 0 Queue
+    WeightGradStore.pop(chunk=0)
+    assert_close(linear.weight.grad, linear_base.weight.grad)
+
+    # check the input gradients
+    assert x_for_shard.grad is not None
+    assert x_for_unshard.grad is not None
+    assert_close(x_for_unshard.grad, x_for_shard.grad)
+
+
 def check_linear_col_plus_row(lazy_init: bool, seq_parallel_mode: bool, overlap: bool):
     ctx = LazyInitContext() if lazy_init else nullcontext()
 
@@ -182,6 +270,8 @@ def run_dist_linear_test(lazy_init, seq_parallel_mode, overlap):
     check_linear_1d_col(lazy_init, seq_parallel_mode, overlap)
     check_linear_1d_row(lazy_init, seq_parallel_mode)
     check_linear_col_plus_row(lazy_init, seq_parallel_mode, overlap)
+    check_linear_without_weight_grad_store(lazy_init, seq_parallel_mode)
+    check_linear_with_weight_grad_store(lazy_init, seq_parallel_mode)
 
 
 def check_dist_linear(rank, world_size, port):

From 5f0924361de4e87f05cbf8aadf9fbd698873a53d Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Thu, 31 Oct 2024 08:18:28 +0000
Subject: [PATCH 161/167] [fix] fix linear (no tp) ops func name;

---
 colossalai/shardformer/layer/__init__.py      |  4 ++--
 colossalai/shardformer/layer/_operation.py    | 10 ++++-----
 colossalai/shardformer/layer/linear.py        | 21 +++++--------------
 colossalai/shardformer/policies/mixtral.py    | 15 +++----------
 examples/language/llama/benchmark.py          |  4 ++--
 .../test_layer/test_linear_1d.py              |  6 +++---
 6 files changed, 19 insertions(+), 41 deletions(-)

diff --git a/colossalai/shardformer/layer/__init__.py b/colossalai/shardformer/layer/__init__.py
index 613ce73c3cf2..4fc714e57cd4 100644
--- a/colossalai/shardformer/layer/__init__.py
+++ b/colossalai/shardformer/layer/__init__.py
@@ -2,7 +2,7 @@
 from .attn import AttnMaskType, ColoAttention, RingAttention, get_pad_info
 from .dropout import DropoutForParallelInput, DropoutForReplicatedInput
 from .embedding import Embedding1D, PaddingEmbedding, VocabParallelEmbedding1D
-from .linear import Linear1D, Linear1D_Col, Linear1D_Row, PaddingLMHead, VocabParallelLMHead1D
+from .linear import Linear1D_Col, Linear1D_Row, LinearWithGradAccum, PaddingLMHead, VocabParallelLMHead1D
 from .loss import cross_entropy_1d, dist_cross_entropy
 from .normalization import FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm
 from .parallel_module import ParallelModule
@@ -11,7 +11,7 @@
 __all__ = [
     "Embedding1D",
     "VocabParallelEmbedding1D",
-    "Linear1D",
+    "LinearWithGradAccum",
     "Linear1D_Col",
     "Linear1D_Row",
     "GPT2FusedLinearConv1D_Col",
diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index 46f50ef0279f..8a068b78cbd7 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -235,17 +235,16 @@ def execute_w_pass(_input_, _grad_output_, _weight_main_grad_=None, wgrad_gemm_f
         return grad_input, grad_weight, grad_bias, None, None, None, None
 
 
-class LinearBase(torch.autograd.Function):
+class LinearWithGradAccum(torch.autograd.Function):
     """
     Linear layer baseline (no tensor parallel version).
     """
 
     @staticmethod
-    def forward(ctx, input_, weight, bias, async_grad_allreduce, fp8_communication=False, use_zbv=False):
+    def forward(ctx, input_, weight, bias, async_grad_allreduce, use_zbv=False):
         ctx.save_for_backward(input_, weight, bias)
         ctx.use_bias = bias is not None
         ctx.async_grad_allreduce = async_grad_allreduce
-        ctx.fp8_communication = fp8_communication
         ctx.use_zbv = use_zbv
         if bias is not None:
             output = F.linear(input_, weight, bias)
@@ -258,7 +257,6 @@ def forward(ctx, input_, weight, bias, async_grad_allreduce, fp8_communication=F
     def backward(ctx, grad_output):
         input, weight, bias = ctx.saved_tensors
         use_bias = ctx.use_bias
-        ctx.fp8_communication
         use_zbv = ctx.use_zbv
 
         def execute_w_pass_grad_accum(_input_, _grad_output_, _weight_main_grad_, wgrad_gemm_accum_func=None):
@@ -1201,8 +1199,8 @@ def linear_with_async_comm(
     )
 
 
-def linear_base(input_, weight, bias, async_grad_allreduce, fp8_communication=False, use_zbv=False):
-    return LinearBase.apply(input_, weight, bias, async_grad_allreduce, fp8_communication, use_zbv)
+def linear_with_grad_accum(input_, weight, bias, async_grad_allreduce, use_zbv=False):
+    return LinearWithGradAccum.apply(input_, weight, bias, async_grad_allreduce, use_zbv)
 
 
 def linear_gather_forward_reducescatter_backward(
diff --git a/colossalai/shardformer/layer/linear.py b/colossalai/shardformer/layer/linear.py
index cb1496a0b4b0..040a93e5a7b9 100644
--- a/colossalai/shardformer/layer/linear.py
+++ b/colossalai/shardformer/layer/linear.py
@@ -25,10 +25,10 @@
 from ._operation import (
     gather_forward_reducescatter_backward,
     gather_forward_split_backward,
-    linear_base,
     linear_gather_forward_reducescatter_backward,
     linear_reducescatter_forward_gather_backward,
     linear_with_async_comm,
+    linear_with_grad_accum,
     reduce_forward,
     reducescatter_forward_gather_backward,
     split_forward_gather_backward,
@@ -36,10 +36,10 @@
 from .parallel_module import PaddingParallelModule, ParallelModule
 from .utils import create_randomizer_with_offset
 
-__all__ = ["Linear1D", "Linear1D_Col", "Linear1D_Row"]
+__all__ = ["LinearWithGradAccum", "Linear1D_Col", "Linear1D_Row"]
 
 
-class Linear1D(ParallelModule):
+class LinearWithGradAccum(ParallelModule):
     r"""Linear layer with no parallelism.
 
     Args:
@@ -69,16 +69,11 @@ def __init__(
         bias: bool = True,
         dtype: torch.dtype = None,
         device: torch.device = None,
-        gather_output: bool = False,
-        seq_parallel_mode: str = None,
-        seq_parallel_dim: int = 1,
-        overlap: torch.cuda.Stream = None,
         skip_bias_add: bool = False,
         weight: Optional[Parameter] = None,
         bias_: Optional[Parameter] = None,
         weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
         bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
-        fp8_communication: bool = False,
         use_zbv: bool = False,
         **kwargs,
     ):
@@ -87,13 +82,8 @@ def __init__(
         # Keep input parameters
         self.in_features = in_features
         self.out_features = out_features
-        self.gather_output = gather_output
-        self.seq_parallel_mode = seq_parallel_mode
-        self.seq_parallel_dim = seq_parallel_dim
-        self.overlap = overlap
         self.skip_bias_add = skip_bias_add
         self.device = device
-        self.fp8_communication = fp8_communication
         self.use_zbv = use_zbv
 
         if skip_bias_add and not bias:
@@ -143,7 +133,7 @@ def from_native_module(module: nn.Linear, **kwargs) -> ParallelModule:
         bias = module.bias is not None
         device = module.weight.device
 
-        linear_1d = Linear1D(
+        linear_1d = LinearWithGradAccum(
             in_features=in_features,
             out_features=out_features,
             bias=bias,
@@ -174,12 +164,11 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
 
         # Matrix multiply.
         bias = self.bias if not self.skip_bias_add else None
-        output_parallel = linear_base(
+        output_parallel = linear_with_grad_accum(
             input_parallel,
             self.weight,
             bias,
             False,
-            fp8_communication=self.fp8_communication,
             use_zbv=self.use_zbv,
         )
 
diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py
index 11291169a442..ece72d929eec 100644
--- a/colossalai/shardformer/policies/mixtral.py
+++ b/colossalai/shardformer/policies/mixtral.py
@@ -52,10 +52,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
         sp_group = self.shard_config.sequence_parallel_process_group or None
         sp_partial_derived = sp_mode in ["split_gather", "ring"]
         tp_size = self.shard_config.tensor_parallel_size
-        if self.pipeline_stage_manager:
-            use_zbv = self.pipeline_stage_manager.use_zbv
-        else:
-            use_zbv = False
+        use_zbv = self.pipeline_stage_manager is not None and self.pipeline_stage_manager.use_zbv
 
         # modified for both SP and TP
         num_q_heads = self.model.config.num_attention_heads
@@ -334,10 +331,7 @@ def get_shared_params(self) -> List[Dict[int, Tensor]]:
 class MixtralForCausalLMPolicy(MixtralPolicy):
     def module_policy(self):
         policy = super().module_policy()
-        if self.pipeline_stage_manager:
-            use_zbv = self.pipeline_stage_manager.use_zbv
-        else:
-            use_zbv = False
+        use_zbv = self.pipeline_stage_manager is not None and self.pipeline_stage_manager.use_zbv
         # TODO: assign pg mesh from plugin to all modules
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for causal lm
@@ -400,10 +394,7 @@ def module_policy(self):
         from transformers import MixtralForSequenceClassification
 
         policy = super().module_policy()
-        if self.pipeline_stage_manager:
-            use_zbv = self.pipeline_stage_manager.use_zbv
-        else:
-            use_zbv = False
+        use_zbv = self.pipeline_stage_manager is not None and self.pipeline_stage_manager.use_zbv
 
         if self.shard_config.enable_tensor_parallelism:
             # add a new item for sequence classification
diff --git a/examples/language/llama/benchmark.py b/examples/language/llama/benchmark.py
index 68ceb9ac1984..4976f0c378ec 100644
--- a/examples/language/llama/benchmark.py
+++ b/examples/language/llama/benchmark.py
@@ -366,10 +366,10 @@ def empty_init():
                 )
                 loss = outputs["loss"]
                 if args.pp_style == "zbv":
-                    if dist.get_rank() == 0:
+                    if coordinator.is_master():
                         print(f"Step {step} loss: {loss}")
                 else:
-                    if dist.get_rank() == dist.get_world_size() - 1:
+                    if coordinator.is_last_process():
                         print(f"Step {step} loss: {loss}")
                 optimizer.step()
                 optimizer.zero_grad()
diff --git a/tests/test_shardformer/test_layer/test_linear_1d.py b/tests/test_shardformer/test_layer/test_linear_1d.py
index 0556bc986c66..773799c1cc09 100644
--- a/tests/test_shardformer/test_layer/test_linear_1d.py
+++ b/tests/test_shardformer/test_layer/test_linear_1d.py
@@ -9,7 +9,7 @@
 import colossalai
 from colossalai.lazy import LazyInitContext
 from colossalai.pipeline.weight_grad_store import WeightGradStore
-from colossalai.shardformer.layer import Linear1D, Linear1D_Col, Linear1D_Row
+from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row, LinearWithGradAccum
 from colossalai.tensor.d_tensor import is_distributed_tensor
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 
@@ -124,7 +124,7 @@ def check_linear_without_weight_grad_store(lazy_init: bool, seq_parallel_mode: b
     linear = nn.Linear(32, 128).cuda()
     with ctx:
         linear_copy = nn.Linear(32, 128).cuda()
-    linear_base = Linear1D.from_native_module(
+    linear_base = LinearWithGradAccum.from_native_module(
         linear_copy, parallel_input=False, seq_parallel_mode=seq_parallel_mode, use_zbv=False
     )
     assert linear_base.weight.shape == torch.Size([128, 32])
@@ -164,7 +164,7 @@ def check_linear_with_weight_grad_store(lazy_init: bool, seq_parallel_mode: bool
     linear = nn.Linear(32, 128).cuda()
     with ctx:
         linear_copy = nn.Linear(32, 128).cuda()
-    linear_base = Linear1D.from_native_module(
+    linear_base = LinearWithGradAccum.from_native_module(
         linear_copy, parallel_input=False, seq_parallel_mode=seq_parallel_mode, use_zbv=True
     )
     assert linear_base.weight.shape == torch.Size([128, 32])

From c2e8f61592011732eab54e2ffacd2de44fdd8096 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Thu, 31 Oct 2024 17:04:53 +0800
Subject: [PATCH 162/167] [checkpointio] fix hybrid plugin model save (#6106)

---
 .../hybrid_parallel_checkpoint_io.py          |  9 +++--
 colossalai/utils/__init__.py                  |  2 ++
 colossalai/utils/common.py                    | 34 ++++++++++++++++++-
 colossalai/zero/gemini/gemini_ddp.py          | 34 ++-----------------
 4 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
index e6abf59e3ac3..79bb33dca3a4 100644
--- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
+++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py
@@ -21,7 +21,7 @@
     to_padded_tensor,
     to_unpadded_tensor,
 )
-from colossalai.utils import get_current_device
+from colossalai.utils import get_current_device, get_non_persistent_buffers_set
 
 from .general_checkpoint_io import GeneralCheckpointIO
 from .index_file import CheckpointIndexFile
@@ -105,8 +105,9 @@ def _model_sharder(
                 yield block, block_size
 
         # Save buffers.
+        non_persist_buffers_set = get_non_persistent_buffers_set(model)
         for name, buf in model.named_buffers():
-            if buf is not None and name not in model._non_persistent_buffers_set:
+            if buf is not None and name not in non_persist_buffers_set:
                 buffer = buf if keep_vars else buf.detach()
                 block, block_size = state_dict_sharder.append_param(prefix + name, buffer)
                 if block is not None:
@@ -352,9 +353,7 @@ def _load(name: str):
             _load(name)
 
         # Load buffers.
-        non_persistent_buffers = set()
-        for n, m in model.named_modules():
-            non_persistent_buffers |= set(".".join((n, b)) for b in m._non_persistent_buffers_set)
+        non_persistent_buffers = get_non_persistent_buffers_set(model)
         for name, buf in model.named_buffers():
             if buf is not None and name not in non_persistent_buffers:
                 _load(name)
diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py
index cdba467091be..1605a5f4eb3b 100644
--- a/colossalai/utils/__init__.py
+++ b/colossalai/utils/__init__.py
@@ -5,6 +5,7 @@
     ensure_path_exists,
     free_storage,
     get_current_device,
+    get_non_persistent_buffers_set,
     is_ddp_ignored,
     set_seed,
 )
@@ -25,4 +26,5 @@
     "set_seed",
     "get_current_device",
     "is_ddp_ignored",
+    "get_non_persistent_buffers_set",
 ]
diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py
index 4a1889eb57ff..0863a812b183 100644
--- a/colossalai/utils/common.py
+++ b/colossalai/utils/common.py
@@ -5,10 +5,11 @@
 import random
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Callable
+from typing import Callable, Optional, Set
 
 import numpy as np
 import torch
+import torch.nn as nn
 
 from colossalai.accelerator import get_accelerator
 
@@ -76,3 +77,34 @@ def set_seed(seed):
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
+
+
+def get_non_persistent_buffers_set(
+    module, memo: Optional[Set[nn.Module]] = None, prefix: str = "", remove_duplicate: bool = True
+):
+    r"""
+    Args:
+        memo: a memo to store the set of modules already added to the result
+        prefix: a prefix that will be added to the name of the module
+        remove_duplicate: whether to remove the duplicated module instances in the result
+            or not
+    """
+
+    if memo is None:
+        memo = set()
+    self_non_persistent_set = set()
+    if module not in memo:
+        if remove_duplicate:
+            memo.add(module)
+        self_non_persistent_set = set(
+            map(lambda key: prefix + ("." if prefix else "") + key, module._non_persistent_buffers_set)
+        )
+        for name, sub_module in module._modules.items():
+            if sub_module is None:
+                continue
+            submodule_prefix = prefix + ("." if prefix else "") + name
+            child_non_persistent_set = get_non_persistent_buffers_set(
+                sub_module, memo, submodule_prefix, remove_duplicate
+            )
+            self_non_persistent_set = set.union(self_non_persistent_set, child_non_persistent_set)
+    return self_non_persistent_set
diff --git a/colossalai/zero/gemini/gemini_ddp.py b/colossalai/zero/gemini/gemini_ddp.py
index dbaae66108fe..680ff07fd607 100644
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@@ -35,7 +35,7 @@
     to_unpadded_tensor,
 )
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
-from colossalai.utils import _cast_float, free_storage, is_ddp_ignored
+from colossalai.utils import _cast_float, free_storage, get_non_persistent_buffers_set, is_ddp_ignored
 
 from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager
 from .gemini_hook import GeminiZeROHook
@@ -187,7 +187,7 @@ def __init__(
             pin_memory=pin_memory,
         )
         super().__init__(module)
-        self._non_persistent_buffers_set = self._get_non_persistent_buffers_set(module)
+        self._non_persistent_buffers_set = get_non_persistent_buffers_set(module)
         self._cast_buffers()
 
         # register grad hook
@@ -257,36 +257,6 @@ def set_params_to_ignore(params_to_ignore: Iterable[torch.Tensor]) -> None:
         for p in params_to_ignore:
             p._ddp_to_ignore = True
 
-    def _get_non_persistent_buffers_set(
-        self, module, memo: Optional[Set[nn.Module]] = None, prefix: str = "", remove_duplicate: bool = True
-    ):
-        r"""
-        Args:
-            memo: a memo to store the set of modules already added to the result
-            prefix: a prefix that will be added to the name of the module
-            remove_duplicate: whether to remove the duplicated module instances in the result
-                or not
-        """
-
-        if memo is None:
-            memo = set()
-        self_non_persistent_set = set()
-        if module not in memo:
-            if remove_duplicate:
-                memo.add(module)
-            self_non_persistent_set = set(
-                map(lambda key: prefix + ("." if prefix else "") + key, module._non_persistent_buffers_set)
-            )
-            for name, sub_module in module._modules.items():
-                if sub_module is None:
-                    continue
-                submodule_prefix = prefix + ("." if prefix else "") + name
-                child_non_persistent_set = self._get_non_persistent_buffers_set(
-                    sub_module, memo, submodule_prefix, remove_duplicate
-                )
-                self_non_persistent_set = set.union(self_non_persistent_set, child_non_persistent_set)
-        return self_non_persistent_set
-
     def _post_forward(self):
         """This function is only triggered for inference."""
         access_list = list(self.chunk_manager.accessed_chunks)

From 2f583c154944b1f00d7a4fb1ce529db9d8595056 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 31 Oct 2024 18:18:01 +0800
Subject: [PATCH 163/167] [pre-commit.ci] pre-commit autoupdate (#6078)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

updates:
- [github.com/psf/black-pre-commit-mirror: 24.8.0 → 24.10.0](https://github.com/psf/black-pre-commit-mirror/compare/24.8.0...24.10.0)
- [github.com/pre-commit/mirrors-clang-format: v18.1.8 → v19.1.2](https://github.com/pre-commit/mirrors-clang-format/compare/v18.1.8...v19.1.2)
- [github.com/pre-commit/pre-commit-hooks: v4.6.0 → v5.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.6.0...v5.0.0)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .pre-commit-config.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f7217a8f1e86..bee6387a701c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -15,21 +15,21 @@ repos:
         args: ["--profile", "black"] # avoid conflict with black
 
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.8.0
+    rev: 24.10.0
     hooks:
     - id: black
       name: black formatter
       args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']
 
   - repo: https://github.com/pre-commit/mirrors-clang-format
-    rev: v18.1.8
+    rev: v19.1.2
     hooks:
     - id: clang-format
       name: clang formatter
       types_or: [c++, c]
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
       - id: check-yaml
       - id: check-merge-conflict

From 3b5c314bea0c7947fd91e26731a427b2e536b8d4 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Fri, 1 Nov 2024 03:54:08 +0000
Subject: [PATCH 164/167] [fix] fix fp8 args in HybridParallel

---
 colossalai/booster/plugin/hybrid_parallel_plugin.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 752e8e1e874d..1af20f473578 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -722,8 +722,6 @@ def __init__(
             overlap_allgather=overlap_allgather,
             fp8_communication=fp8_communication,
             backward_context=model._hook_context,
-            fp8_communication=fp8_communication,
-            backward_context=model._hook_context,
         )
 
     def sync_dp_grads(self):
@@ -1162,7 +1160,6 @@ def __init__(
                     enable_metadata_cache=enable_metadata_cache,
                     overlap_p2p=overlap_p2p,
                     fp8_communication=fp8_communication,
-                    fp8_communication=fp8_communication,
                 )
             elif pp_style == "1f1b":
                 self.scheduler = OneForwardOneBackwardSchedule(
@@ -1213,7 +1210,6 @@ def __init__(
             make_vocab_size_divisible_by=make_vocab_size_divisible_by,
             gradient_checkpoint_config=gradient_checkpoint_config,
             fp8_communication=fp8_communication,
-            fp8_communication=fp8_communication,
             inner_ring_size=inner_ring_size,
             pg_mesh=self.pg_mesh,
             sp_axis=self.sp_axis,
@@ -1247,7 +1243,6 @@ def __init__(
             forced_dtype=PRECISION_TORCH_TYPE[precision],
             overlap_allgather=overlap_allgather,
             fp8_communication=fp8_communication,
-            fp8_communication=fp8_communication,
         )
 
         self.max_norm = max_norm

From 5b5fbcff09092ccecf54dde05dc6ee25235d98b2 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Fri, 1 Nov 2024 05:27:11 +0000
Subject: [PATCH 165/167] [fix] fix hybridparall use_fp8 config

---
 colossalai/booster/plugin/hybrid_parallel_plugin.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/colossalai/booster/plugin/hybrid_parallel_plugin.py b/colossalai/booster/plugin/hybrid_parallel_plugin.py
index 1af20f473578..58d055bb06af 100644
--- a/colossalai/booster/plugin/hybrid_parallel_plugin.py
+++ b/colossalai/booster/plugin/hybrid_parallel_plugin.py
@@ -78,7 +78,6 @@ def __init__(
         self.require_grad_sync = True
         self.overlap_allgather = overlap_allgather
         self.use_fp8 = use_fp8
-        self.use_fp8 = use_fp8
 
         shardformer = ShardFormer(shard_config)
         if custom_policy is not None:
@@ -1099,7 +1098,6 @@ def __init__(
         self.enable_jit_fused = enable_jit_fused
         self.enable_sequence_parallelism = enable_sequence_parallelism
         self.use_fp8 = use_fp8
-        self.use_fp8 = use_fp8
         if dp_outside:
             self.dp_axis, self.pp_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3
             self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size, self.sp_size)
@@ -1325,7 +1323,6 @@ def configure(
                 custom_policy=self.custom_policy,
                 overlap_allgather=(self.zero_stage > 0 and self.zero_config["overlap_allgather"]),
                 use_fp8=self.use_fp8,
-                use_fp8=self.use_fp8,
             )
         if optimizer is not None and not isinstance(optimizer, OptimizerWrapper):
             if zero_stage == 0:

From 0218e673db79eda513a71054694b8845a4b1ee1b Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Fri, 1 Nov 2024 07:05:24 +0000
Subject: [PATCH 166/167] [fix] fix use_fp8 flag

---
 colossalai/shardformer/layer/_operation.py   | 6 ++----
 tests/kit/model_zoo/transformers/__init__.py | 3 ++-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index d918076075e6..8c2e6e7c5d92 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -723,9 +723,7 @@ class _MatmulWithGatherForwardReduceScatterBackward(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(
-        ctx, input_, weight, bias, process_group, async_grad_reduce_scatter, dim, overlap, ring, fp8_communication
-    ):
+    def forward(ctx, input_, weight, bias, process_group, async_grad_reduce_scatter, dim, ring, fp8_communication):
         ctx.save_for_backward(input_, weight, bias)
         ctx.use_bias = bias is not None
         ctx.process_group = process_group
@@ -793,7 +791,7 @@ def backward(ctx, grad_output):
         if ctx.async_grad_reduce_scatter:
             handle.wait()
 
-        return output, grad_weight, grad_bias, None, None, None, None, None, None
+        return output, grad_weight, grad_bias, None, None, None, None, None
 
 
 class _SplitForwardGatherBackward(torch.autograd.Function):
diff --git a/tests/kit/model_zoo/transformers/__init__.py b/tests/kit/model_zoo/transformers/__init__.py
index 4adc386192d3..02996823166a 100644
--- a/tests/kit/model_zoo/transformers/__init__.py
+++ b/tests/kit/model_zoo/transformers/__init__.py
@@ -2,7 +2,8 @@
 from .bert import *
 from .blip2 import *
 from .bloom import *
-from .chatglm2 import *
+
+# from .chatglm2 import *
 from .command import *
 from .deepseek import *
 from .falcon import *

From 8e400876332971ac1e4b6aea146d23a1fbdf67a7 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Fri, 1 Nov 2024 09:02:07 +0000
Subject: [PATCH 167/167] [fix] fix model zoo init

---
 tests/kit/model_zoo/transformers/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/kit/model_zoo/transformers/__init__.py b/tests/kit/model_zoo/transformers/__init__.py
index 02996823166a..4adc386192d3 100644
--- a/tests/kit/model_zoo/transformers/__init__.py
+++ b/tests/kit/model_zoo/transformers/__init__.py
@@ -2,8 +2,7 @@
 from .bert import *
 from .blip2 import *
 from .bloom import *
-
-# from .chatglm2 import *
+from .chatglm2 import *
 from .command import *
 from .deepseek import *
 from .falcon import *