From 061ee4a95e2376c2977cc93d9aaddb34574c7c3b Mon Sep 17 00:00:00 2001 From: Edenzzzz Date: Wed, 29 May 2024 16:22:10 +0800 Subject: [PATCH] [Feature] MoE refactor; Integration with Mixtral (#5682) * cherry pick from refractor-moe branch * tests passed * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * support ep + zero --------- Co-authored-by: Edenzzzz Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/build_on_pr.yml | 2 +- LICENSE | 2 +- .../benchmarks/benchmark_opt_lora_dummy.py | 6 +- .../Chat/coati/dataset/sft_dataset.py | 16 +- applications/Chat/coati/models/base/actor.py | 1 - applications/Chat/coati/ray/utils.py | 4 +- .../Chat/coati/trainer/strategies/ddp.py | 3 +- .../community/peft/train_peft_prompts.py | 4 +- applications/Chat/examples/train_sft.sh | 2 +- .../colossal_llama2/model/init_model.py | 3 +- .../colossal_llama2/utils/ckpt_io.py | 2 +- .../utils/stream_chat_patch.py | 119 ++- .../Colossal-LLaMA-2/docs/example_13b.md | 6 +- .../Colossal-LLaMA-2/docs/example_7b.md | 2 +- .../Colossal-LLaMA-2/hostfile.example | 2 +- .../Colossal-LLaMA-2/inference_example.py | 2 +- .../Colossal-LLaMA-2/requirements.txt | 1 - .../Colossal-LLaMA-2/stream_chat_example.py | 45 +- applications/Colossal-LLaMA-2/version.txt | 2 +- .../examples/dataset_evaluation/inference.py | 1 - applications/ColossalMoE/README.md | Bin 6474 -> 6475 bytes applications/ColossalMoE/infer.py | 4 +- .../ColossalMoE/mixtral_checkpoint.py | 629 ----------- .../ColossalMoE/tests/test_mixtral_layer.py | 11 +- .../ColossalMoE/tests/test_moe_checkpoint.py | 55 +- applications/ColossalMoE/train.py | 21 +- .../colossalqa/chain/retrieval_qa/base.py | 18 +- .../colossalqa/data_loader/document_loader.py | 2 +- .../data_loader/table_dataloader.py | 75 +- .../colossalqa/local/colossalcloud_llm.py | 66 +- .../ColossalQA/colossalqa/local/llm.py | 1 - .../ColossalQA/colossalqa/prompt/prompt.py | 1 - .../ColossalQA/colossalqa/retriever.py | 8 +- .../ColossalQA/data/data_sample/companies.txt | 6 +- .../data/data_sample/companies_zh.txt | 2 +- .../data/data_sample/csv_organization_100.csv | 2 +- applications/ColossalQA/data/tests/64KB.json | 2 +- .../ColossalQA/data/tests/companies.csv | 202 ++-- applications/ColossalQA/data/tests/test.html | 48 +- applications/ColossalQA/data/tests/test.md | 4 +- applications/ColossalQA/data/tests/test.txt | 76 +- .../retrieval_conversation_universal.py | 33 +- .../examples/webui_demo/RAG_ChatBot.py | 12 +- .../ColossalQA/examples/webui_demo/README.md | 2 +- .../ColossalQA/examples/webui_demo/config.py | 52 +- .../ColossalQA/examples/webui_demo/server.py | 25 +- .../ColossalQA/examples/webui_demo/webui.py | 31 +- applications/ColossalQA/pytest.ini | 2 +- .../ColossalQA/tests/test_document_loader.py | 14 +- .../ColossalQA/tests/test_retrieval_qa.py | 80 +- applications/ColossalQA/version.txt | 2 +- colossalai/__init__.py | 2 +- .../plugin/moe_hybrid_parallel_plugin.py | 110 +- .../booster/plugin/torch_fsdp_plugin.py | 9 +- .../hybrid_parallel_checkpoint_io.py | 194 ++-- colossalai/checkpoint_io/utils.py | 4 +- colossalai/cluster/process_group_mesh.py | 68 +- colossalai/inference/engine/modeling/llama.py | 49 +- .../quant/gptq/cai_gptq/cai_quant_linear.py | 202 ++-- colossalai/kernel/triton/context_attention.py | 133 ++- colossalai/kernel/triton/flash_decoding.py | 51 +- .../kernel/triton/llama_act_combine_kernel.py | 65 +- colossalai/kernel/triton/token_attention_kernel.py | 20 +- 
.../tensor_parallel/modeling/llama.py | 4 +- colossalai/moe/checkpoint.py | 976 ++++++++++-------- colossalai/moe/load_balance.py | 442 ++++++++ colossalai/moe/utils.py | 9 +- colossalai/shardformer/layer/moe/__init__.py | 3 + colossalai/shardformer/layer/moe/experts.py | 161 +++ colossalai/shardformer/layer/moe/layers.py | 404 ++++++++ colossalai/shardformer/layer/moe/routers.py | 161 +++ colossalai/shardformer/modeling/mixtral.py | 18 +- colossalai/shardformer/policies/mixtral.py | 5 +- colossalai/shardformer/shard/shard_config.py | 1 + colossalai/tensor/d_tensor/__init__.py | 4 +- colossalai/tensor/d_tensor/api.py | 7 +- colossalai/tensor/moe_tensor/api.py | 7 +- colossalai/zero/low_level/_utils.py | 1 + colossalai/zero/low_level/low_level_optim.py | 6 +- .../train_vit_with_hybrid_parallelism.md | 2 +- examples/language/data_utils.py | 2 +- .../openmoe/benchmark/benchmark_cai.py | 2 +- examples/language/openmoe/benchmark/utils.py | 6 +- examples/language/openmoe/infer.py | 14 +- .../openmoe/model/convert_openmoe_ckpt.py | 22 +- .../openmoe/model/modeling_openmoe.py | 10 +- .../language/openmoe/model/openmoe_policy.py | 90 +- examples/language/openmoe/train.py | 44 +- extensions/cpu_adam/__init__.py | 3 +- extensions/layernorm/__init__.py | 2 +- extensions/moe/__init__.py | 2 +- extensions/optimizer/__init__.py | 2 +- extensions/softmax/__init__.py | 2 +- tests/kit/model_zoo/__init__.py | 44 +- tests/kit/model_zoo/registry.py | 2 +- tests/kit/model_zoo/transformers/chatglm2.py | 1 + .../test_plugin/test_3d_plugin.py | 2 +- .../test_torch_fsdp_checkpoint_io.py | 1 + tests/test_gptq/test_gptq_linear.py | 30 +- tests/test_lazy/test_models.py | 2 +- tests/test_moe/test_moe_load_balance.py | 2 +- .../test_layer/test_dist_crossentropy.py | 5 +- .../test_model/test_shard_gptj.py | 1 + 103 files changed, 2928 insertions(+), 2192 deletions(-) delete mode 100644 applications/ColossalMoE/mixtral_checkpoint.py create mode 100644 colossalai/moe/load_balance.py create mode 100644 colossalai/shardformer/layer/moe/__init__.py create mode 100644 colossalai/shardformer/layer/moe/experts.py create mode 100644 colossalai/shardformer/layer/moe/layers.py create mode 100644 colossalai/shardformer/layer/moe/routers.py diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index b01d15490e0f..2cad504f3391 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -201,4 +201,4 @@ jobs: uses: actions/upload-artifact@v3 with: name: report - path: report/ \ No newline at end of file + path: report/ diff --git a/LICENSE b/LICENSE index bacb03e72246..47197afe6644 100644 --- a/LICENSE +++ b/LICENSE @@ -551,4 +551,4 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. + THE SOFTWARE. 
diff --git a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py index 0d0e2a7d34f5..47ad29f46c64 100644 --- a/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py +++ b/applications/Chat/benchmarks/benchmark_opt_lora_dummy.py @@ -76,9 +76,11 @@ def main(args): if args.strategy == "ddp": strategy = DDPStrategy() elif args.strategy == "colossalai_gemini": - strategy = GeminiStrategy(placement_policy="static",initial_scale=2**5) + strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5) elif args.strategy == "colossalai_gemini_cpu": - strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5) + strategy = GeminiStrategy( + placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5 + ) elif args.strategy == "colossalai_zero2": strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda") elif args.strategy == "colossalai_zero2_cpu": diff --git a/applications/Chat/coati/dataset/sft_dataset.py b/applications/Chat/coati/dataset/sft_dataset.py index e67e16231cc2..0167f118dc8f 100644 --- a/applications/Chat/coati/dataset/sft_dataset.py +++ b/applications/Chat/coati/dataset/sft_dataset.py @@ -51,11 +51,21 @@ def _preprocess( """Preprocess the data by tokenizing.""" sequences = [s + t + tokenizer.eos_token for s, t in zip(sources, targets)] sequences_token = tokenizer( - sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=False + sequences, + max_length=max_length, + padding="max_length", + truncation=True, + return_tensors="pt", + add_special_tokens=False, ) sources_token = tokenizer( - sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", add_special_tokens=False + sources, + max_length=max_length, + padding="max_length", + truncation=True, + return_tensors="pt", + add_special_tokens=False, ) assert sequences_token["attention_mask"].dim() == 2, "seq2seq model should be preprocessed differently" @@ -66,7 +76,7 @@ def _preprocess( if tokenizer.padding_side == "right": # |prompt|completion|eos|pad| labels[i][:source_len] = IGNORE_INDEX - if pad_len>0: + if pad_len > 0: labels[i][-pad_len:] = IGNORE_INDEX elif tokenizer.padding_side == "left": # |pad|prompt|completion|eos| diff --git a/applications/Chat/coati/models/base/actor.py b/applications/Chat/coati/models/base/actor.py index 8b2b81ed071c..0634631df7a3 100644 --- a/applications/Chat/coati/models/base/actor.py +++ b/applications/Chat/coati/models/base/actor.py @@ -30,4 +30,3 @@ def forward( """Returns model output.""" output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs) return output - diff --git a/applications/Chat/coati/ray/utils.py b/applications/Chat/coati/ray/utils.py index b88140c0e036..4882f00b7eca 100644 --- a/applications/Chat/coati/ray/utils.py +++ b/applications/Chat/coati/ray/utils.py @@ -75,7 +75,9 @@ def get_strategy_from_args(strategy: str): elif strategy == "colossalai_zero2": strategy_ = LowLevelZeroStrategy(stage=2, placement_policy="cuda") elif strategy == "colossalai_gemini_cpu": - strategy_ = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5) + strategy_ = GeminiStrategy( + placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5 + ) elif strategy == "colossalai_zero2_cpu": strategy_ = LowLevelZeroStrategy(stage=2, 
placement_policy="cpu") else: diff --git a/applications/Chat/coati/trainer/strategies/ddp.py b/applications/Chat/coati/trainer/strategies/ddp.py index f2a44aeb0961..3d3d44ca51ae 100644 --- a/applications/Chat/coati/trainer/strategies/ddp.py +++ b/applications/Chat/coati/trainer/strategies/ddp.py @@ -101,16 +101,17 @@ def save_pretrained( model_path = os.path.join(path, "pytorch_model.bin") self.save_model(model, model_path, shard=shard) + def _replace_keys(model_path: str, replace_fn: Callable): state_dict = torch.load(model_path, map_location="cpu") state_dict = {replace_fn(k): v for k, v in state_dict.items()} torch.save(state_dict, model_path) + # FIXME: save_model would add "model." prefix to keys of pytorch_model.bin # HACK: rename keys of pytorch_model.bin if dist.get_rank() == 0: _replace_keys(model_path, lambda k: k.replace("model.", "", 1)) - def get_model_state_dict_shard(self, model: nn.Module, **config): # TODO: implement sharding on naive strategy model = self.unwrap_model(model) diff --git a/applications/Chat/examples/community/peft/train_peft_prompts.py b/applications/Chat/examples/community/peft/train_peft_prompts.py index 1dd9ffcdf1cd..4625f2bda4de 100644 --- a/applications/Chat/examples/community/peft/train_peft_prompts.py +++ b/applications/Chat/examples/community/peft/train_peft_prompts.py @@ -24,7 +24,9 @@ def main(args): if args.strategy == "ddp": strategy = DDPStrategy() elif args.strategy == "colossalai_gemini": - strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5) + strategy = GeminiStrategy( + placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5 + ) elif args.strategy == "colossalai_zero2": strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu") else: diff --git a/applications/Chat/examples/train_sft.sh b/applications/Chat/examples/train_sft.sh index b7d176847d9c..0fb4da3d3ce8 100755 --- a/applications/Chat/examples/train_sft.sh +++ b/applications/Chat/examples/train_sft.sh @@ -25,4 +25,4 @@ torchrun --standalone --nproc_per_node=4 train_sft.py \ --accumulation_steps 8 \ --lr 2e-5 \ --max_datasets_size 512 \ - --max_epochs 1 \ No newline at end of file + --max_epochs 1 diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py b/applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py index 67e487f43b08..f61291f35d04 100644 --- a/applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py +++ b/applications/Colossal-LLaMA-2/colossal_llama2/model/init_model.py @@ -8,11 +8,10 @@ import numpy as np import torch -from transformers import LlamaTokenizer, LlamaForCausalLM +from transformers import LlamaForCausalLM, LlamaTokenizer from colossalai.logging import get_dist_logger - logger = get_dist_logger() diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/ckpt_io.py b/applications/Colossal-LLaMA-2/colossal_llama2/utils/ckpt_io.py index 85decf37dd0b..05342ce41a60 100644 --- a/applications/Colossal-LLaMA-2/colossal_llama2/utils/ckpt_io.py +++ b/applications/Colossal-LLaMA-2/colossal_llama2/utils/ckpt_io.py @@ -10,8 +10,8 @@ from typing import Any, Dict, Tuple, Union import torch -from torch.optim.optimizer import Optimizer from torch.optim.lr_scheduler import _LRScheduler +from torch.optim.optimizer import Optimizer from colossalai.booster import Booster from colossalai.cluster import DistCoordinator diff --git a/applications/Colossal-LLaMA-2/colossal_llama2/utils/stream_chat_patch.py 
b/applications/Colossal-LLaMA-2/colossal_llama2/utils/stream_chat_patch.py index 8f8eecb18eb0..44fa3678d621 100644 --- a/applications/Colossal-LLaMA-2/colossal_llama2/utils/stream_chat_patch.py +++ b/applications/Colossal-LLaMA-2/colossal_llama2/utils/stream_chat_patch.py @@ -1,20 +1,19 @@ from copy import deepcopy -from typing import Optional, List, Dict, Tuple, Callable, Any +from typing import Any, Callable, Dict, List, Optional, Tuple import torch from torch import nn - from transformers import PreTrainedTokenizer -from transformers.utils import logging from transformers.generation.utils import GenerationConfig, LogitsProcessorList, StoppingCriteriaList - +from transformers.utils import logging + logger = logging.get_logger(__name__) def get_prompt_template( - input_query:str, - history:List[Dict]= None, - roles:list = ["", "Human", "Assistant"], + input_query: str, + history: List[Dict] = None, + roles: list = ["", "Human", "Assistant"], ) -> str: """ Generates a prompt template for chat models based on input and history. @@ -32,7 +31,7 @@ def get_prompt_template( new_history = [] else: new_history = deepcopy(history) - + new_history.append({"role": roles[1], "message": input_query.strip()}) new_history.append({"role": roles[2], "message": None}) @@ -48,22 +47,23 @@ def get_prompt_template( prompt += f"{role}: " return prompt + @torch.inference_mode() def streaming_chat( - model: Any, + model: Any, tokenizer: PreTrainedTokenizer, - input_query: str, - history: List[Dict] = None, - roles: list = ["", "Human", "Assistant"], - past_key_values: Tuple[Tuple[torch.FloatTensor, Any], Any] = None, - temperature: float = 0.8, - top_p: float = 0.95, - top_k: int = 50, - do_sample: bool = True, + input_query: str, + history: List[Dict] = None, + roles: list = ["", "Human", "Assistant"], + past_key_values: Tuple[Tuple[torch.FloatTensor, Any], Any] = None, + temperature: float = 0.8, + top_p: float = 0.95, + top_k: int = 50, + do_sample: bool = True, length_penalty: float = 1.2, - max_new_tokens: int = 512, - logits_processor: LogitsProcessorList = None, - return_past_key_values: bool = False, + max_new_tokens: int = 512, + logits_processor: LogitsProcessorList = None, + return_past_key_values: bool = False, **kwargs, ): """ @@ -87,7 +87,7 @@ def streaming_chat( **kwargs: Additional keyword arguments for generation. Yields: - Tuple[str, List[Dict], Optional[Tuple[Tuple[torch.FloatTensor, Any], Any]]]: A tuple containing the generated response, updated history, and + Tuple[str, List[Dict], Optional[Tuple[Tuple[torch.FloatTensor, Any], Any]]]: A tuple containing the generated response, updated history, and optionally the updated past key values if `return_past_key_values` is True. Ensures padding is on the left side for the tokenizer. 
@@ -97,32 +97,37 @@ def streaming_chat( history = [] if logits_processor is None: logits_processor = LogitsProcessorList() - + generation_kwargs = { - 'temperature': temperature, - 'top_p': top_p, - 'top_k': top_k, - 'do_sample': do_sample, - 'max_new_tokens': max_new_tokens, - 'length_penalty': length_penalty, - 'use_cache': True, - **kwargs + "temperature": temperature, + "top_p": top_p, + "top_k": top_k, + "do_sample": do_sample, + "max_new_tokens": max_new_tokens, + "length_penalty": length_penalty, + "use_cache": True, + **kwargs, } prompt_str = get_prompt_template(input_query, history=history, roles=roles) - + eos_token_id = [tokenizer.eos_token_id] inputs = tokenizer(prompt_str, return_tensors="pt").to(model.device) history.append({"role": roles[1], "message": input_query.strip()}) history.append({"role": roles[2], "message": None}) - for outputs in stream_generate(model, **inputs, past_key_values=past_key_values, - eos_token_id=eos_token_id, return_past_key_values=return_past_key_values, - **generation_kwargs): + for outputs in stream_generate( + model, + **inputs, + past_key_values=past_key_values, + eos_token_id=eos_token_id, + return_past_key_values=return_past_key_values, + **generation_kwargs, + ): if return_past_key_values: outputs, past_key_values = outputs - outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1] + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) : -1] response = tokenizer.decode(outputs) history[-1]["message"] = response.strip() @@ -130,30 +135,30 @@ def streaming_chat( yield response, history, past_key_values else: yield response, history - + @torch.inference_mode() def stream_generate( - model: Any, - input_ids: torch.Tensor, + model: Any, + input_ids: torch.Tensor, generation_config: Optional[GenerationConfig] = None, logits_processor: Optional[LogitsProcessorList] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None, - return_past_key_values: bool = False, + return_past_key_values: bool = False, **kwargs, ): """ Generates sequences of token ids using the specified model and generation parameters. Adapted from https://huggingface.co/THUDM/chatglm3-6b/blob/main/modeling_chatglm.py - + Args: model (Any): The model used for generating sequences of token ids. - input_ids (torch.Tensor): The sequence used as a prompt for the generation or as model inputs to the encoder. + input_ids (torch.Tensor): The sequence used as a prompt for the generation or as model inputs to the encoder. generation_config (Optional[GenerationConfig]): The generation configuration to be used as base parametrization for the generation call. logits_processor (Optional[LogitsProcessorList]): Custom logits processors that complement the default logits processors built from arguments and generation config. - stopping_criteria (Optional[StoppingCriteriaList]): Custom stopping criteria that complement the default stopping criteria built from arguments + stopping_criteria (Optional[StoppingCriteriaList]): Custom stopping criteria that complement the default stopping criteria built from arguments and a generation config. prefix_allowed_tokens_fn (Optional[Callable[[int, torch.Tensor], List[int]]]): Function to constrain token generation. return_past_key_values (bool): Whether to return past key values for further incremental decoding, defaults to False. 
@@ -169,7 +174,7 @@ def stream_generate( generation_config = model.generation_config generation_config = deepcopy(generation_config) model_kwargs = generation_config.update(**kwargs) - + eos_token_id = generation_config.eos_token_id if isinstance(eos_token_id, int): eos_token_id = [eos_token_id] @@ -177,25 +182,25 @@ def stream_generate( if generation_config.max_new_tokens is not None: generation_config.max_length = generation_config.max_new_tokens + input_ids_len - + if input_ids_len >= generation_config.max_length: input_ids_string = "decoder_input_ids" if model.config.is_encoder_decoder else "input_ids" logger.warning( - f"Input length of {input_ids_string} is {input_ids_len}, but `max_length` is set to" - f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" - " increasing `max_new_tokens`." - ) + f"Input length of {input_ids_string} is {input_ids_len}, but `max_length` is set to" + f" {generation_config.max_length}. This can lead to unexpected behavior. You should consider" + " increasing `max_new_tokens`." + ) logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - + # prepare distribution pre_processing samplers logits_processor = model._get_logits_processor( - generation_config=generation_config, - input_ids_seq_length=input_ids_len, - encoder_input_ids=input_ids, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - logits_processor=logits_processor, - ) + generation_config=generation_config, + input_ids_seq_length=input_ids_len, + encoder_input_ids=input_ids, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + logits_processor=logits_processor, + ) # prepare stopping criteria stopping_criteria = model._get_stopping_criteria( @@ -205,7 +210,7 @@ def stream_generate( logits_warper = model._get_logits_warper(generation_config) unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) scores = None - + while True: model_inputs = model.prepare_inputs_for_generation(input_ids, **model_kwargs) # forward pass to get next token @@ -244,4 +249,4 @@ def stream_generate( yield input_ids # stop when each sentence is finished, or if exceed the maximum length if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - break \ No newline at end of file + break diff --git a/applications/Colossal-LLaMA-2/docs/example_13b.md b/applications/Colossal-LLaMA-2/docs/example_13b.md index 3854aaa6a933..5305a999ba71 100644 --- a/applications/Colossal-LLaMA-2/docs/example_13b.md +++ b/applications/Colossal-LLaMA-2/docs/example_13b.md @@ -43,7 +43,7 @@ if __name__ == '__main__': model.to(device) tokenizer = AutoTokenizer.from_pretrained("hpcai-tech/Colossal-LLaMA-2-13b-base") - + question = "xxx" # Your question. prompt = (f"请对下面的问题,给出准确、可靠、安全的回答\n" f"{question}" @@ -111,7 +111,7 @@ The Colossal-AI team has conducted comprehensive evaluation for safety and value | 整车热管理主要包括什么内容 | 整车热管理主要包括:1、发动机冷却系统;2、散热系统;3、空调冷却系统;4、采暖系统;5、动力转向器冷却系统;6、润滑油冷却系统;7、变速箱冷却系统;8、燃油冷却系统;9、冷却系统排放液系统。 | | 简述下汽车空调-热泵循环系统的原理 | 汽车空调系统主要有三种形式,其中热泵循环系统是其中的一种。其原理是利用发动机排放的热量,通过压缩机将冷媒压缩成高温高压的状态,然后通过冷却器释放到空气中,吸收周围的热量,再通过膨胀阀释放到蒸发器中,蒸发器在压缩机的作用下将冷媒压缩成高温高压的状态,再次释放到空气中,从而实现制冷的效果。 | -#### Finance +#### Finance | Question |
Colossal-LLaMA-2-13b-base
| | :------: | :----------------------- | | 如何看待当前提前还贷热 | 提前还贷是一种常见的还贷方式,在市场中存在着许多的因素和影响,需要具体分析当前的经济形势和人们心理状况的影响。提前还贷可以帮助借款人早日摆脱债务负担,但也存在一些风险和不确定性。从经济形势来看,当前利率下探、还款压力减轻等因素可能会导致一部分借款人提前还贷。此外,一些购房者可能会受到市场不景气的影响,担心未来房产价值会下降,因此提前还贷以避免资产贬值。从借款人心理状况来看,提前还贷也可能会由于借款人对未来经济预期的不确定性而产生压力和不满。因此,提前还贷并不能一概而论,需要根据具体情况进行分析。如果提前还贷对借款人的经济情况和未来规划造成不利影响,建议谨慎考虑是否要提前还贷。| @@ -150,4 +150,4 @@ The Colossal-AI team has conducted comprehensive evaluation for safety and value ## Conclusion -The Colossal-AI team's advanced 13B model, compared to the 7B version, features a refined data structure categorizing information into informative, functional, and memory replay data. Informative data is intricately subdivided into major categories, each further segmented for precise control. Concurrently, data scale across domains is expanded. Tailored enhancements meet community demands for large model capabilities in natural language processing tasks, ensuring proficiency during pre-training and cost-effective fine-tuning. Addressing security and values concerns, multidimensional controls are implemented, securing the baseline model and aligning it with correct values. \ No newline at end of file +The Colossal-AI team's advanced 13B model, compared to the 7B version, features a refined data structure categorizing information into informative, functional, and memory replay data. Informative data is intricately subdivided into major categories, each further segmented for precise control. Concurrently, data scale across domains is expanded. Tailored enhancements meet community demands for large model capabilities in natural language processing tasks, ensuring proficiency during pre-training and cost-effective fine-tuning. Addressing security and values concerns, multidimensional controls are implemented, securing the baseline model and aligning it with correct values. diff --git a/applications/Colossal-LLaMA-2/docs/example_7b.md b/applications/Colossal-LLaMA-2/docs/example_7b.md index d889ab4165d0..d833d28218b5 100644 --- a/applications/Colossal-LLaMA-2/docs/example_7b.md +++ b/applications/Colossal-LLaMA-2/docs/example_7b.md @@ -242,4 +242,4 @@ To comprehensively assess the performance of the Colossal-LLaMA-2-7B-base model, ## Conclusion In general, the Colossal-LLaMA-2-7B-base model not only enhances its understanding of English but also exhibits significant improvements in its comprehension of Chinese. It boasts a broad spectrum of general knowledge, encompassing various fields such as food, sports, technology, literature, games, and more. Regarding text generation tasks, the Colossal-LLaMA-2-7B-base model excels in writing performance; however, its ability to generate specific formats like code, emails, tables, etc., needs enhancement due to the scarcity of relevant training data during our training phase. When compared to the Qwen-7b-base model, the Colossal-LLaMA-2-7B-base model outperforms it in answering most English questions and some Chinese questions, as demonstrated in the examples above. -Presently, the Colossal-LLaMA-2-7B-base model already exhibits some capabilities in sentiment analysis, logical reasoning, information extraction, role-play, classification, and rewriting. These capabilities are poised for further improvement in the future as part of our ongoing enhancements. \ No newline at end of file +Presently, the Colossal-LLaMA-2-7B-base model already exhibits some capabilities in sentiment analysis, logical reasoning, information extraction, role-play, classification, and rewriting. 
These capabilities are poised for further improvement in the future as part of our ongoing enhancements. diff --git a/applications/Colossal-LLaMA-2/hostfile.example b/applications/Colossal-LLaMA-2/hostfile.example index 82948648cbc9..cfaaa0ef559f 100644 --- a/applications/Colossal-LLaMA-2/hostfile.example +++ b/applications/Colossal-LLaMA-2/hostfile.example @@ -1,2 +1,2 @@ hostname1 -hostname2 \ No newline at end of file +hostname2 diff --git a/applications/Colossal-LLaMA-2/inference_example.py b/applications/Colossal-LLaMA-2/inference_example.py index 63ce91e50432..8d301616d678 100644 --- a/applications/Colossal-LLaMA-2/inference_example.py +++ b/applications/Colossal-LLaMA-2/inference_example.py @@ -15,7 +15,7 @@ def load_model(model_path, device="cuda", **kwargs): model.to(device) try: - tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left') + tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left") except OSError: raise ImportError("Tokenizer not found. Please check if the tokenizer exists or the model path is correct.") diff --git a/applications/Colossal-LLaMA-2/requirements.txt b/applications/Colossal-LLaMA-2/requirements.txt index 34afaf7e5cfd..5cdb8e7f3348 100644 --- a/applications/Colossal-LLaMA-2/requirements.txt +++ b/applications/Colossal-LLaMA-2/requirements.txt @@ -12,4 +12,3 @@ flash-attn>=2.0.0,<=2.0.5 tqdm sentencepiece==0.1.99 protobuf<=3.20.0 - diff --git a/applications/Colossal-LLaMA-2/stream_chat_example.py b/applications/Colossal-LLaMA-2/stream_chat_example.py index 3e45c690f878..4c0d1fe2a35f 100644 --- a/applications/Colossal-LLaMA-2/stream_chat_example.py +++ b/applications/Colossal-LLaMA-2/stream_chat_example.py @@ -1,11 +1,11 @@ -import os import argparse -from transformers import AutoTokenizer, AutoModelForCausalLM from colossal_llama2.utils.stream_chat_patch import streaming_chat +from transformers import AutoModelForCausalLM, AutoTokenizer SYSTEM = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." 
+ def main(args): model = AutoModelForCausalLM.from_pretrained(args.model_path).cuda().eval() tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path) @@ -27,29 +27,34 @@ def main(args): print(f"\n{roles[2]}: ", end="") gen_len = 0 for response, history, past_key_values in streaming_chat( - model, tokenizer, input_query, history=history, roles=roles, - temperature = args.temperature, - top_p = args.top_p, - top_k = args.top_k, - do_sample = args.do_sample, - length_penalty = args.length_penalty, - max_new_tokens = args.max_new_tokens, + model, + tokenizer, + input_query, + history=history, + roles=roles, + temperature=args.temperature, + top_p=args.top_p, + top_k=args.top_k, + do_sample=args.do_sample, + length_penalty=args.length_penalty, + max_new_tokens=args.max_new_tokens, past_key_values=past_key_values, - return_past_key_values=True): - + return_past_key_values=True, + ): output = response[gen_len:] print(output, end="", flush=True) gen_len = len(response) + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--model_path', type=str, default=None, help="path to chat version model") - parser.add_argument('--tokenizer_path', type=str, default=None, help="path to chat version tokenizer") - parser.add_argument('--temperature', type=float, default=0.8, help="set temperature") - parser.add_argument('--top_p', type=float, default=0.95, help="set top p value") - parser.add_argument('--top_k', type=int, default=50, help="set top k value") - parser.add_argument('--do_sample', type=bool, default=True, help="whether turn on do_sample or not") - parser.add_argument('--length_penalty', type=float, default=1.2, help="set length penalty") - parser.add_argument('--max_new_tokens', type=int, default=512, help="set max new tokens") + parser.add_argument("--model_path", type=str, default=None, help="path to chat version model") + parser.add_argument("--tokenizer_path", type=str, default=None, help="path to chat version tokenizer") + parser.add_argument("--temperature", type=float, default=0.8, help="set temperature") + parser.add_argument("--top_p", type=float, default=0.95, help="set top p value") + parser.add_argument("--top_k", type=int, default=50, help="set top k value") + parser.add_argument("--do_sample", type=bool, default=True, help="whether turn on do_sample or not") + parser.add_argument("--length_penalty", type=float, default=1.2, help="set length penalty") + parser.add_argument("--max_new_tokens", type=int, default=512, help="set max new tokens") args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/applications/Colossal-LLaMA-2/version.txt b/applications/Colossal-LLaMA-2/version.txt index 8a9ecc2ea99d..8acdd82b765e 100644 --- a/applications/Colossal-LLaMA-2/version.txt +++ b/applications/Colossal-LLaMA-2/version.txt @@ -1 +1 @@ -0.0.1 \ No newline at end of file +0.0.1 diff --git a/applications/ColossalEval/examples/dataset_evaluation/inference.py b/applications/ColossalEval/examples/dataset_evaluation/inference.py index a340f3bfd281..13bbb12b6990 100644 --- a/applications/ColossalEval/examples/dataset_evaluation/inference.py +++ b/applications/ColossalEval/examples/dataset_evaluation/inference.py @@ -3,7 +3,6 @@ import os from typing import Dict, List -import torch import torch.distributed as dist from colossal_eval import dataset, models, utils diff --git a/applications/ColossalMoE/README.md b/applications/ColossalMoE/README.md index 
be50a8f9f25111edabd7c13660bb976f9cc1f252..ba864d1dff8b2c52b3a5c45261f37586ecdb5bc1 100644 GIT binary patch delta 9 QcmX?QblPZxmn0(>02EsT7ytkO delta 7 OcmX?YbjoOhmm~lVHv None: - super().__init__(dp_group, pp_group, tp_group, zero_stage, verbose) - moe_info = MOE_MANAGER.parallel_info_dict[MOE_MANAGER.ep_size] - self.ep_group = moe_info.ep_group - self.ep_size = moe_info.ep_size - self.ep_rank = moe_info.ep_rank - self.real_dp_rank = moe_info.dp_rank - - @staticmethod - def _model_sharder( - model: nn.Module, - prefix: str = "", - keep_vars: bool = False, - size_per_shard: int = 1024, - param_name_pattern: Optional[str] = None, - ) -> Iterator[Tuple[OrderedDict, int]]: - # An internel method that breaks state_dict of model into shards within limited size. - - state_dict_sharder = StateDictSharder(size_per_shard) - - # Save parameters. - for name, param in model.named_parameters(): - if param is None: - continue - if param_name_pattern is not None and param_name_pattern not in name: - continue - # Gather tensor pieces when using tensor parallel. - param_ = gather_distributed_param(param, keep_vars=False) - block, block_size = state_dict_sharder.append_param(prefix + name, param_) - if block is not None: - yield block, block_size - - # Save buffers. - for name, buf in model.named_buffers(): - if buf is not None and name not in model._non_persistent_buffers_set: - buffer = buf if keep_vars else buf.detach() - block, block_size = state_dict_sharder.append_param(prefix + name, buffer) - if block is not None: - yield block, block_size - - # Save extra states. - extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX - if ( - getattr(model.__class__, "get_extra_state", torch.nn.Module.get_extra_state) - is not torch.nn.Module.get_extra_state - ): - extra_state = model.get_extra_state() - block, block_size = state_dict_sharder.append_param(extra_state_key, extra_state) - if block is not None: - yield block, block_size - - # Return the last block in sharder. - yield state_dict_sharder.current_block, state_dict_sharder.current_block_size - - def save_sharded_model( - self, - model: ModelWrapper, - checkpoint: str, - gather_dtensor: bool = True, - prefix: Optional[str] = None, - size_per_shard: int = 1024, - use_safetensors: bool = False, - ) -> None: - """ - Save sharded model checkpoint under the given checkpointing path. - The following files will be created under the path: - - An index file (pytorch_model.bin.index.json) containing a map between model params/buffers and file names. - - Multiple files that store state tensors of models. - If pipeline parallelism is used, the filenames are in the form of "pytorch_model.-stage-000XX-shard-000XX.bin". - If pipeline parallelism is not used, "pytorch_model.-000XX.bin" - - - Args: - model (nn.Module): Model on local device to be saved. - checkpoint (str): Checkpointing path which should be a directory path. - gather_dtensor (bool, optional): Whether to gather_dtensor, currently not used. Defaults to True. - prefix (str, optional): Perfix of file to save. Defaults to None. - size_per_shard (int, optional): Size per shard in MB. Defaults to 1024. - use_safetensors (bool, optional): Whether to use safe tensors. Defaults to False. - """ - - assert isinstance(model, ModelWrapper), "Please boost the model before saving!" 
- model = model.unwrap() - - if os.path.isfile(checkpoint): - logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") - return - - Path(checkpoint).mkdir(parents=True, exist_ok=True) - - if self.real_dp_rank != 0: - dist.barrier() - return - - # ep_rank 0 saves all the parameters and buffers. - # other ep_ranks save only experts - ep_param_pattern = "experts." if self.ep_rank != 0 else None - - # Then collect the sharded parameters & buffers along tp_group. - # Only devices with tp_rank == 0 are responsible for model saving. - state_dict_shard = MixtralMoEHybridParallelCheckpointIO._model_sharder( - model, size_per_shard=size_per_shard, param_name_pattern=ep_param_pattern - ) - weights_name, save_index_file = get_model_base_filenames(prefix, use_safetensors) - index_file = CheckpointIndexFile(checkpoint) - control_saving = self.tp_rank == 0 - - if self.pp_size == 1 and self.ep_size == 1: - # When pipeline is not used, save the model shards as in general checkpointIO - total_size = save_state_dict_shards( - sharded_state_dict=state_dict_shard, - checkpoint=checkpoint, - index_file=index_file, - base_filename=weights_name, - is_master=control_saving, - use_safetensors=use_safetensors, - ) - if control_saving: - index_file.append_meta_data("total_size", total_size) - index_file.write_index_file(save_index_file) - save_config_file(model, checkpoint) - if self.verbose and self.coordinator.is_master(): - logging.info( - f"The model is split into checkpoint shards. " - f"You can find where each parameters has been saved in the " - f"index located at {save_index_file}." - ) - - dist.barrier() - else: - # When pipeline is used, each stage produces its own shard files and index files. - # Index files belonging to each stage are saved under a temporary folder ./tmp_index_files/ - # After all the state_dicts have been saved, the master rank integrates all the index files into one final index file and deletes the tmp folder. - - final_index_file_path = copy.deepcopy(save_index_file) - tmp_index_file_folder = os.path.join(checkpoint, "tmp_index_files") - Path(tmp_index_file_folder).mkdir(parents=True, exist_ok=True) - - # Manage filenames of sharded weights and index file for each pipeline stage. - weights_name = weights_name.replace(".bin", f"-stage-{self.pp_rank+1:05d}-{self.ep_rank+1:05d}-shard.bin") - weights_name = weights_name.replace( - ".safetensors", f"-stage-{self.pp_rank+1:05d}-{self.ep_rank+1:05d}-shard.safetensors" - ) - save_index_file = save_index_file.replace(".json", f"-stage-{self.pp_rank+1:05d}-{self.ep_rank+1:05d}.json") - save_index_file = os.path.join("tmp_index_files", save_index_file) - - total_size = save_state_dict_shards( - sharded_state_dict=state_dict_shard, - checkpoint=checkpoint, - index_file=index_file, - base_filename=weights_name, - is_master=control_saving, - use_safetensors=use_safetensors, - use_pp_format=True, - ) - if control_saving: - index_file.append_meta_data("total_size", total_size) - index_file.write_index_file(save_index_file) - else: - dist.barrier() - return - - dist.barrier() - - # The global master rank integrates the index files and clean the folder. 
- if self.coordinator.is_master(): - final_index_file = CheckpointIndexFile(checkpoint) - final_index_file.append_meta_data("total_size", 0) - - for filename in os.listdir(tmp_index_file_folder): - stage_index_file = CheckpointIndexFile.from_file(os.path.join(tmp_index_file_folder, filename)) - final_index_file.metadata["total_size"] += stage_index_file.metadata["total_size"] - for weight, weight_filename in stage_index_file.weight_map.items(): - final_index_file.append_weight_map(weight, weight_filename) - - final_index_file.write_index_file(final_index_file_path) - save_config_file(model, checkpoint) - rmtree(tmp_index_file_folder) - if self.verbose and self.coordinator.is_master(): - logging.info( - f"The model is split into checkpoint shards. " - f"You can find where each parameters has been saved in the " - f"index located at {final_index_file_path}." - ) - - @staticmethod - def gather_from_sharded_optimizer_state( - state: OrderedDict, - param: torch.Tensor, - original_shape: torch.Size, - dp_group: ProcessGroup, - tp_group: ProcessGroup, - use_zero: bool, - inplace: bool, - is_moe_param: bool, - device: torch.device = torch.device("cpu"), - ) -> OrderedDict: - """ - With given parameter and its optimizer states, gather the complete optimizer state for saving. - - Args: - state (OrderedDict): Optimizer states of given parameter, might be distributed among tp/dp group if using TP/Zero. - param (torch.Tensor): The given parameter. It should be working_param when using Zero. - original_shape (torch.Size): The size of parameter before sharding. - dp_group (ProcessGroup): The process group of data parallel. - tp_group (ProcessGroup): The process group of tensor parallel. - use_zero (bool): Whether Zero is used. - inplace (bool): If set to True, will update the values of argument 'state' in place. Else will make a copy of state. - device (torch.device): The destination device of loaded optimizer states. Defaults to torch.device('cpu'). - - Returns: - OrderedDict: The complete optimizer state of given parameter. - """ - dp_size = dist.get_world_size(dp_group) - tp_size = dist.get_world_size(tp_group) - current_shape = param.shape - state_ = state if inplace else copy.deepcopy(state) - - for k, v in state_.items(): - if isinstance(v, torch.Tensor) and k != "step": - # First gather Zero shards. - if use_zero and not is_moe_param: - v = v.cuda() - gather_tensor = [torch.zeros_like(v) for _ in range(dp_size)] - dist.all_gather(gather_tensor, v, group=dp_group) - v = torch.stack(gather_tensor).view(-1)[: param.numel()].reshape_as(param) - - # Then gather TP shards. - partition_dim = search_tp_partition_dim(current_shape, original_shape, tp_size) - if partition_dim is not None: - gather_tensor = [torch.zeros_like(v) for _ in range(tp_size)] - dist.all_gather(gather_tensor, v, group=tp_group) - v = torch.cat(gather_tensor, dim=partition_dim) - - state_[k] = v.detach().clone().to(device) - - return state_ - - @staticmethod - def _optimizer_sharder( - optimizer: OptimizerWrapper, - use_zero: bool, - dp_group: ProcessGroup, - tp_group: ProcessGroup, - size_per_shard: int = 1024, - only_moe_param: bool = False, - ): - # An internel method that breaks state_dict of optimizer into shards within limited size. 
- - state_dict_sharder = StateDictSharder(size_per_shard) - param_info = optimizer.param_info - master_to_working_map = optimizer.get_master_to_working_map() - - for param, state in optimizer.optim.state.items(): - if param is None: - continue - - if master_to_working_map is not None: - working_param = master_to_working_map[id(param)] - else: - working_param = param - - param_id = param_info["param2id"][id(working_param)] - original_shape = param_info["param2shape"][id(working_param)] - state_ = MixtralMoEHybridParallelCheckpointIO.gather_from_sharded_optimizer_state( - state, - working_param, - original_shape=original_shape, - dp_group=dp_group, - tp_group=tp_group, - use_zero=use_zero, - inplace=False, - is_moe_param=is_moe_tensor(working_param), - ) - - if only_moe_param and not is_moe_tensor(working_param): - continue - block, block_size = state_dict_sharder.append_optim_state(param_id, state_) - if block is not None: - yield block, block_size - - # Return the last block in sharder. - yield state_dict_sharder.current_block, state_dict_sharder.current_block_size - - def save_sharded_optimizer( - self, - optimizer: OptimizerWrapper, - checkpoint: str, - gather_dtensor: bool = True, - prefix: Optional[str] = None, - size_per_shard: int = 1024, - ): - """ - Save sharded optimizer checkpoint under the given checkpointing path. - The following files will be created under the path: - - An index file (pytorch_optim.bin.index.json) containing a map between optimizer states and file names - - A group file (pytorch_optim_group.bin) recording information of param_groups - - Multiple files that store state tensors of optimizers. - If pipeline parallelism is used, the filenames are in the form of "pytorch_optim.-stage-000XX-shard-000XX.bin". - If pipeline parallelism is not used, "pytorch_optim.-000XX.bin" - - Args: - optimizer (OptimizerWrapper): Optimizer to save sharded state_dict - checkpoint (str): Path to save optimizer state_dict - gather_dtensor (bool): Whether to gather_dtensor, not used - prefix (str): Perfix of file to save - size_per_shard (int): Max file size of each file shard that store state tensors - """ - assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before saving!" - if os.path.isfile(checkpoint): - logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") - return - - Path(checkpoint).mkdir(parents=True, exist_ok=True) - - # Devices along the same dp_group share the same copies of states when zero is not used. - # In this case only let the device with dp_rank == 0 save the model. - if not self.use_zero and self.real_dp_rank != 0: - dist.barrier() - return - - # Then collect the sharded states along dp_group(if using zero)/tp_group. - # Only devices with (dp_rank == 0 and tp_rank == 0) are responsible for states saving. 
- state_dict_shard = MixtralMoEHybridParallelCheckpointIO._optimizer_sharder( - optimizer, - use_zero=self.use_zero, - dp_group=self.dp_group, - tp_group=self.tp_group, - size_per_shard=size_per_shard, - only_moe_param=self.ep_rank != 0, - ) - states_name, save_index_file, param_group_file = get_optimizer_base_filenames(prefix) - index_file = CheckpointIndexFile(checkpoint) - control_saving = self.real_dp_rank == 0 and self.tp_rank == 0 - - if self.pp_size == 1 and self.ep_size == 1: - # When pipeline is not used, save the optimizer shards as in general checkpointIO - total_size = save_state_dict_shards( - sharded_state_dict=state_dict_shard, - checkpoint=checkpoint, - index_file=index_file, - base_filename=states_name, - is_master=control_saving, - ) - - if control_saving: - # Store param groups. - index_file.append_meta_data("param_groups", param_group_file) - group_file_path = os.path.join(checkpoint, param_group_file) - param_groups = [ - {**group, "params": group_info["params"]} - for group, group_info in zip(optimizer.param_groups, optimizer.param_info["param_groups"]) - ] - save_param_groups({"param_groups": param_groups}, group_file_path) - # Store index file. - index_file.append_meta_data("total_size", total_size) - index_file.write_index_file(save_index_file) - if self.verbose and self.coordinator.is_master(): - logging.info( - f"The optimizer is going to be split to checkpoint shards. " - f"You can find where each parameters has been saved in the " - f"index located at {save_index_file}." - ) - - dist.barrier() - else: - # When pipeline is used, each stage produces its own shard files and index files. - # Index files belonging to each stage are saved under a temporary folder ./tmp_index_files/ - # After all the state_dicts have been saved, the master rank integrates all the index files into one final index file and deletes the tmp folder. - - final_index_file_path = copy.deepcopy(save_index_file) - tmp_index_file_folder = os.path.join(checkpoint, "tmp_index_files") - Path(tmp_index_file_folder).mkdir(parents=True, exist_ok=True) - - # Manage filenames of sharded weights and index file for each pipeline stage. - states_name = states_name.replace(".bin", f"-stage-{self.pp_rank+1:05d}-{self.ep_rank+1:05d}-shard.bin") - save_index_file = save_index_file.replace(".json", f"-stage-{self.pp_rank+1:05d}-{self.ep_rank+1:05d}.json") - save_index_file = os.path.join("tmp_index_files", save_index_file) - - total_size = save_state_dict_shards( - sharded_state_dict=state_dict_shard, - checkpoint=checkpoint, - index_file=index_file, - base_filename=states_name, - is_master=control_saving, - use_pp_format=True, - ) - - if control_saving: - index_file.append_meta_data("total_size", total_size) - index_file.write_index_file(save_index_file) - else: - dist.barrier() - return - - dist.barrier() - - # The global master rank integrates the index files and clean the folder. - if self.coordinator.is_master(): - final_index_file = CheckpointIndexFile(checkpoint) - final_index_file.append_meta_data("total_size", 0) - - for filename in os.listdir(tmp_index_file_folder): - stage_index_file = CheckpointIndexFile.from_file(os.path.join(tmp_index_file_folder, filename)) - final_index_file.metadata["total_size"] += stage_index_file.metadata["total_size"] - for param_id, state_filename in stage_index_file.weight_map.items(): - final_index_file.append_weight_map(param_id, state_filename) - - # Store param groups. 
- final_index_file.append_meta_data("param_groups", param_group_file) - group_file_path = os.path.join(checkpoint, param_group_file) - param_groups = [ - {**group, "params": group_info["params"]} - for group, group_info in zip(optimizer.param_groups, optimizer.param_info["param_groups"]) - ] - save_param_groups({"param_groups": param_groups}, group_file_path) - - final_index_file.write_index_file(final_index_file_path) - rmtree(tmp_index_file_folder) - - if self.verbose and self.coordinator.is_master(): - logging.info( - f"The model is split into checkpoint shards. " - f"You can find where each parameters has been saved in the " - f"index located at {final_index_file_path}." - ) - - def load_sharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint_index_file: str, prefix: str = ""): - """ - Load sharded optimizer with the given path to index file of checkpoint folder. - - Args: - optimizer (OptimizerWrapper): The optimizer to be loaded. - checkpoint_index_file (str): Path to the index file of checkpointing folder. - prefix (str): Not used. - """ - assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before loading!" - - def _get_param_id_from_optimizer_param( - param: torch.Tensor, master_to_working_map: Optional[Dict[int, torch.Tensor]] = None - ): - if master_to_working_map is not None: - working_param = master_to_working_map[id(param)] - else: - working_param = param - return optimizer.param_info["param2id"][id(working_param)] - - # id_map is a mapping from param ids kept by current pipeline, to their corresponding parameter objects. - # When Zero is used, the mapped parameter objects should be fp32 master parameters. - # IDs should be obtained through saved param2id mapping earlier saved in optimizer.param_info. - id_map = {} - master_to_working_map = optimizer.get_master_to_working_map() - for pg in optimizer.optim.param_groups: - for param in pg["params"]: - param_id = _get_param_id_from_optimizer_param(param, master_to_working_map) - id_map[param_id] = param - - # Read checkpoint index file. - ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file) - ckpt_root_path = ckpt_index_file.root_path - weight_map = ckpt_index_file.weight_map - weight_map = {int(k): v for k, v in weight_map.items()} # convert saved id from str to int - - # Load param_groups - param_group_path = ckpt_index_file.get_param_group_filename() - if param_group_path is None: - raise RuntimeError( - f"Invalid index file path {checkpoint_index_file} for an optimizer. \ - Lacking param group file under current directory." - ) - saved_groups = torch.load(param_group_path) - - updated_groups = [] - for old_pg, saved_pg in zip(optimizer.optim.param_groups, saved_groups): - # obtain updated param group - new_pg = copy.deepcopy(saved_pg) - new_pg["params"] = old_pg["params"] # The parameters in the same group shouln't change. - updated_groups.append(new_pg) - # ep param groups - if len(optimizer.optim.param_groups) == len(saved_groups) + 1: - new_pg = copy.deepcopy(saved_pg) - new_pg["params"] = optimizer.optim.param_groups[-1]["params"] - updated_groups.append(new_pg) - optimizer.optim.__dict__.update({"param_groups": updated_groups}) - - # Load saved states to optimizer. - # Keep a record of loaded files so that file will not be repeatedly loaded. 
- loaded_file = set() - for pg in optimizer.optim.param_groups: - for param in pg["params"]: - if param is None: - continue - param_id = _get_param_id_from_optimizer_param(param, master_to_working_map) - if param_id not in weight_map: - continue - filename = weight_map[param_id] - - # If this param's states has been loaded before, directly return. - if filename in loaded_file: - continue - - file_path = os.path.join(ckpt_root_path, filename) - state_dict = load_shard_state_dict(Path(file_path), use_safetensors=False) - load_states_into_optimizer(optimizer.optim, state_dict, id_map, strict=True) - loaded_file.add(filename) - - # Then shard the loaded optimizer states if using tp/zero. - for param, state in optimizer.optim.state.items(): - device = param.device - if master_to_working_map is not None: - working_param = master_to_working_map[id(param)] - else: - working_param = param - original_shape = optimizer.param_info["param2shape"][id(working_param)] - sharded_state = self.shard_from_complete_optimizer_state( - state, - current_shape=working_param.shape, - original_shape=original_shape, - device=device, - inplace=True, - is_moe_param=is_moe_tensor(working_param), - ) - optimizer.optim.state[param] = sharded_state - - sharded_optimizer_loading_epilogue(optimizer.optim) - if self.verbose and self.coordinator.is_master(): - logging.info(f"The optimizer has been successfully loaded from sharded checkpoint: {ckpt_root_path}.") - - def shard_from_complete_optimizer_state( - self, - state: OrderedDict, - current_shape: torch.Size, - original_shape: torch.Size, - device: torch.device, - inplace: bool, - is_moe_param: bool, - ) -> OrderedDict: - """ - With complete optimizer states of a specific parameter loaded from checkpoint, - slice out the sharded optimizer states kept by current device. - - Args: - state (OrderedDict): Complete optimizer states of a given parameter, loaded from checkpoint. - current_shape (torch.Size): The size of parameter after sharding. - original_shape (torch.Size): The size of parameter before sharding. - device (torch.device): The destination device of loaded optimizer states. - inplace (bool): If set to True, will update the values of argument 'state' in place. Else will make a copy of state. - - Returns: - OrderedDict: The sharded optimizer state of the given parameter. - """ - state_ = state if inplace else copy.deepcopy(state) - - for k, v in state_.items(): - if isinstance(v, torch.Tensor) and k != "step": - # Shard state along tensor parallel group. - partition_dim = search_tp_partition_dim(current_shape, original_shape, self.tp_size) - if partition_dim is not None: - slice_size = current_shape[partition_dim] - v = v.split(slice_size, dim=partition_dim)[self.tp_rank] - - # Shard state along data parallel group when using Zero. 
- if self.use_zero and not is_moe_param: - padding_size = (self.dp_size - v.numel() % self.dp_size) % self.dp_size - with torch.no_grad(): - v = v.flatten() - if padding_size > 0: - v = torch.nn.functional.pad(v, [0, padding_size]) - slice_size = v.numel() // self.dp_size - v = v.split(slice_size, dim=0)[self.dp_rank] - - state_[k] = v.detach().clone().to(device) - - return state_ - - def save_unsharded_model(self, model: ModelWrapper, checkpoint: str, gather_dtensor: bool, use_safetensors: bool): - raise NotImplementedError - - def save_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, gather_dtensor: bool): - raise NotImplementedError - - def load_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, strict: bool = False): - raise NotImplementedError diff --git a/applications/ColossalMoE/tests/test_mixtral_layer.py b/applications/ColossalMoE/tests/test_mixtral_layer.py index c235ae8385f5..efac386e29f7 100644 --- a/applications/ColossalMoE/tests/test_mixtral_layer.py +++ b/applications/ColossalMoE/tests/test_mixtral_layer.py @@ -8,7 +8,7 @@ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock import colossalai -from colossalai.moe import MOE_MANAGER +from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin from colossalai.shardformer.modeling.mixtral import EPMixtralSparseMoeBlock from colossalai.testing.utils import spawn @@ -19,8 +19,11 @@ def check_mixtral_moe_layer(): torch.cuda.set_device(dist.get_rank()) - MOE_MANAGER.setup( - parallel="EP", mode="fixed", fixed_dp_size=1, fixed_ep_size=dist.get_world_size(), fixed_pp_size=1 + plugin = MoeHybridParallelPlugin( + precision="bf16", + tp_size=1, + pp_size=1, + ep_size=dist.get_world_size(), ) config = MixtralConfig( hidden_size=hidden_size, @@ -33,7 +36,7 @@ def check_mixtral_moe_layer(): x = torch.rand(1, tokens, hidden_size, requires_grad=True).cuda() orig_output, orig_logits = orig_model(x) model = deepcopy(orig_model) - model = EPMixtralSparseMoeBlock.from_native_module(model) + model = EPMixtralSparseMoeBlock.from_native_module(model, plugin.ep_group) ep_output, ep_logits = model(x) assert_close(orig_logits, ep_logits) assert_close(orig_output, ep_output) diff --git a/applications/ColossalMoE/tests/test_moe_checkpoint.py b/applications/ColossalMoE/tests/test_moe_checkpoint.py index c6be32106a79..0d9dea80d552 100644 --- a/applications/ColossalMoE/tests/test_moe_checkpoint.py +++ b/applications/ColossalMoE/tests/test_moe_checkpoint.py @@ -1,9 +1,9 @@ +import shutil from copy import deepcopy import pytest import torch import torch.distributed as dist -from mixtral_checkpoint import MixtralMoEHybridParallelCheckpointIO from torch.optim import Adam from transformers.models.mixtral.configuration_mixtral import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralForCausalLM @@ -11,6 +11,9 @@ import colossalai from colossalai.booster import Booster from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin +from colossalai.moe import MoECheckpointIO +from colossalai.shardformer.policies.mixtral import MixtralForCausalLMPolicy +from colossalai.tensor.moe_tensor.api import is_moe_tensor from colossalai.testing.utils import spawn tokens, n_experts = 7, 4 @@ -20,8 +23,14 @@ def check_model_equal(model1, model2): assert set(model1.state_dict().keys()) == set(model2.state_dict().keys()) - for p1, p2 in zip(model1.parameters(), model2.parameters()): - assert torch.equal(p1.half(), p2.half()) + 
for i, ((name, p1), p2) in enumerate(zip(model1.named_parameters(), model2.parameters())): + if not torch.equal(p1.half(), p2.half()): + # exit distributed + print(f"Model parameter {name} is not equal. is_moe_tensor: {is_moe_tensor(p1)}") + raise AssertionError(f"Model parameter {name} is not equal") + # dist.destroy_process_group() + # exit(1) + # print(f"Passed: {name}") def get_optimizer_snapshot(optim): @@ -40,7 +49,7 @@ def get_optimizer_snapshot(optim): } -def check_optimizer_snapshot_equal(snapshot1, snapshot2): +def check_optimizer_snapshot_equal(snapshot1, snapshot2, param2name, moe_dp_group=None): # check param_groups assert len(snapshot1["param_groups"]) == len(snapshot2["param_groups"]) for group1, group2 in zip(snapshot1["param_groups"], snapshot2["param_groups"]): @@ -51,14 +60,26 @@ def check_optimizer_snapshot_equal(snapshot1, snapshot2): assert set(snapshot1["state"].keys()) == set( snapshot2["state"].keys() ), f"{snapshot1['state'].keys()}, {snapshot2['state'].keys()}" + + passed = True + count = 0 for pid in snapshot1["state"].keys(): state1, state2 = snapshot1["state"][pid], snapshot2["state"][pid] assert set(state1.keys()) == set(state2.keys()) + bug = False for k in state1.keys(): if isinstance(state1[k], torch.Tensor): - assert torch.equal(state1[k], state2[k]), f"{k}, {state1[k]}, {state2[k]}" + if not torch.equal(state1[k], state2[k]): + bug = True + count += 1 else: assert state1[k] == state2[k] + if bug: + passed = False + print(f"rank {dist.get_rank()} optim mismatch: {param2name[pid]}") + + if not passed: + raise AssertionError(f"A total of {count} optim states are not equal") def check_mixtral_moe_layer(): @@ -77,10 +98,11 @@ def check_mixtral_moe_layer(): model = deepcopy(orig_model) optimizer = Adam(model.parameters(), lr=1e-3) plugin = MoeHybridParallelPlugin( - tp_size=1, pp_size=2, ep_size=2, - checkpoint_io=MixtralMoEHybridParallelCheckpointIO, + tp_size=1, + checkpoint_io=MoECheckpointIO, + custom_policy=MixtralForCausalLMPolicy(), microbatch_size=1, zero_stage=1, ) @@ -103,9 +125,9 @@ def check_mixtral_moe_layer(): if dist.get_rank() == 0: saved_model = MixtralForCausalLM.from_pretrained("mixtral_model").cuda() check_model_equal(orig_model, saved_model) + # check_model_equal(model, saved_model) saved_model.save_pretrained("mixtral_hf_model") dist.barrier() - # check load model new_model = MixtralForCausalLM(config).cuda() new_optimizer = Adam(new_model.parameters(), lr=1e-3) @@ -120,6 +142,9 @@ def check_mixtral_moe_layer(): snapshot = get_optimizer_snapshot(optimizer.unwrap()) booster.save_optimizer(optimizer, "mixtral_optim", shard=True) dist.barrier() + + working2master = optimizer.get_working_to_master_map() + param2name = {id(working2master[id(p)]): n for n, p in model.named_parameters()} # reset optimizer state for state in optimizer.unwrap().state.values(): for v in state.values(): @@ -127,7 +152,14 @@ def check_mixtral_moe_layer(): v.zero_() booster.load_optimizer(optimizer, "mixtral_optim") loaded_snapshot = get_optimizer_snapshot(optimizer.unwrap()) - check_optimizer_snapshot_equal(snapshot, loaded_snapshot) + check_optimizer_snapshot_equal(snapshot, loaded_snapshot, param2name, model) + + # Clean up + dist.barrier() + if dist.get_rank() == 0: + shutil.rmtree("mixtral_model") + shutil.rmtree("mixtral_hf_model") + shutil.rmtree("mixtral_optim") def run_dist(rank: int, world_size: int, port: int): @@ -135,10 +167,11 @@ def run_dist(rank: int, world_size: int, port: int): check_mixtral_moe_layer() -@pytest.mark.parametrize("world_size", [4]) +# 
Test EP + ZeRO + PP +@pytest.mark.parametrize("world_size", [8]) def test_mixtral_moe_layer(world_size: int): spawn(run_dist, world_size) if __name__ == "__main__": - test_mixtral_moe_layer(4) + test_mixtral_moe_layer(8) diff --git a/applications/ColossalMoE/train.py b/applications/ColossalMoE/train.py index dd24506ad0a3..1e057b4e5722 100644 --- a/applications/ColossalMoE/train.py +++ b/applications/ColossalMoE/train.py @@ -2,7 +2,6 @@ import torch import torch.distributed as dist -from mixtral_checkpoint import MixtralMoEHybridParallelCheckpointIO from torch.utils.data import Dataset from tqdm import tqdm from transformers import AutoTokenizer @@ -13,8 +12,10 @@ from colossalai.booster import Booster from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin from colossalai.cluster import DistCoordinator +from colossalai.moe.checkpoint import MoECheckpointIO from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR from colossalai.nn.optimizer import HybridAdam +from colossalai.shardformer.policies.mixtral import MixtralForCausalLMPolicy from colossalai.utils import get_current_device @@ -127,13 +128,13 @@ def parse_args(): parser.add_argument( "--comm_overlap", action="store_true", - help="Use communication overlap for MoE. Recommended to enable for muiti-node training.", + help="Use communication overlap for MoE. Recommended to enable for multi-node training.", ) # hierarchical all-to-all parser.add_argument( "--hierarchical_alltoall", action="store_true", - help="Use hierarchical all-to-all for MoE. Recommended to enable for muiti-node training.", + help="Use hierarchical all-to-all for MoE. Recommended to enable for multi-node training.", ) args = parser.parse_args() @@ -154,11 +155,12 @@ def main(): pp_size=args.pp_size, ep_size=args.ep_size, microbatch_size=args.microbatch_size, + custom_policy=MixtralForCausalLMPolicy(), enable_fused_normalization=args.use_layernorm_kernel, enable_jit_fused=args.use_kernel, precision=args.precision, zero_stage=args.zero_stage, - checkpoint_io=MixtralMoEHybridParallelCheckpointIO, + checkpoint_io=MoECheckpointIO, ) else: @@ -236,7 +238,6 @@ def main(): lambda x, y: x.loss, optimizer, return_loss=True, - return_outputs=True, ) # Backward and optimize if is_pp_last_stage: @@ -258,7 +259,15 @@ def main(): lr_scheduler.step() optimizer.zero_grad() - # save ckeckpoint + # Apply load balance + # if ( + # args.load_balance + # and args.load_balance_interval > 0 + # and (step + 1) % args.load_balance_interval == 0 + # ): + # coordinator.print_on_master(f"Apply load balance") + # apply_load_balance(model, optimizer) + # save checkpoint if (step + 1) % args.save_interval == 0: coordinator.print_on_master(f"Saving model checkpoint to {args.output_path}") save_checkpoint( diff --git a/applications/ColossalQA/colossalqa/chain/retrieval_qa/base.py b/applications/ColossalQA/colossalqa/chain/retrieval_qa/base.py index a6e87e6bea9f..80dbf47def2b 100644 --- a/applications/ColossalQA/colossalqa/chain/retrieval_qa/base.py +++ b/applications/ColossalQA/colossalqa/chain/retrieval_qa/base.py @@ -24,6 +24,7 @@ from langchain.schema import BaseRetriever, Document from langchain.schema.language_model import BaseLanguageModel + class CustomBaseRetrievalQA(BaseRetrievalQA): """Base class for question-answering chains.""" @@ -98,7 +99,6 @@ def _call( for k, v in inputs.items() if k in ["stop", "temperature", "top_k", "top_p", "max_new_tokens", "doc_prefix"] } - answers = [] if self.combine_documents_chain.memory is not None: 
buffered_history_backup, summarized_history_temp_backup = copy.deepcopy( self.combine_documents_chain.memory.buffered_history @@ -117,10 +117,10 @@ def _call( ) = copy.deepcopy(buffered_history_backup), copy.deepcopy(summarized_history_temp_backup) # if rejection_trigger_keywords is not given, return the response from LLM directly - rejection_trigger_keywords = inputs.get('rejection_trigger_keywords', []) + rejection_trigger_keywords = inputs.get("rejection_trigger_keywords", []) answer = answer if all([rej not in answer for rej in rejection_trigger_keywords]) else None - if answer is None: - answer = inputs.get('rejection_answer', "抱歉,根据提供的信息无法回答该问题。") + if answer is None: + answer = inputs.get("rejection_answer", "抱歉,根据提供的信息无法回答该问题。") if self.combine_documents_chain.memory is not None: self.combine_documents_chain.memory.save_context({"question": question}, {"output": answer}) @@ -161,10 +161,14 @@ async def _acall( input_documents=docs, question=question, callbacks=_run_manager.get_child(), **kwargs ) # if rejection_trigger_keywords is not given, return the response from LLM directly - rejection_trigger_keywords = inputs.get('rejection_trigger_keywords', []) - answer = answer if all([rej not in answer for rej in rejection_trigger_keywords]) or len(rejection_trigger_keywords)==0 else None + rejection_trigger_keywords = inputs.get("rejection_trigger_keywords", []) + answer = ( + answer + if all([rej not in answer for rej in rejection_trigger_keywords]) or len(rejection_trigger_keywords) == 0 + else None + ) if answer is None: - answer = inputs.get('rejection_answer', "抱歉,根据提供的信息无法回答该问题。") + answer = inputs.get("rejection_answer", "抱歉,根据提供的信息无法回答该问题。") self.combine_documents_chain.memory.save_context({"question": question}, {"output": answer}) if self.return_source_documents: diff --git a/applications/ColossalQA/colossalqa/data_loader/document_loader.py b/applications/ColossalQA/colossalqa/data_loader/document_loader.py index 4ddbf2b9d249..cbcd6ad1d2d3 100644 --- a/applications/ColossalQA/colossalqa/data_loader/document_loader.py +++ b/applications/ColossalQA/colossalqa/data_loader/document_loader.py @@ -126,7 +126,7 @@ def load_data(self, path: str) -> None: else: # May ba a directory, we strictly follow the glob path and will not load files in subdirectories pass - + def clear(self): """ Clear loaded data. diff --git a/applications/ColossalQA/colossalqa/data_loader/table_dataloader.py b/applications/ColossalQA/colossalqa/data_loader/table_dataloader.py index cad48254498e..29542466fa8f 100644 --- a/applications/ColossalQA/colossalqa/data_loader/table_dataloader.py +++ b/applications/ColossalQA/colossalqa/data_loader/table_dataloader.py @@ -1,39 +1,40 @@ -''' +""" Class for loading table type data. please refer to Pandas-Input/Output for file format details. 
-''' +""" -import os import glob +import os + import pandas as pd -from sqlalchemy import create_engine -from colossalqa.utils import drop_table from colossalqa.mylogging import get_logger +from colossalqa.utils import drop_table +from sqlalchemy import create_engine logger = get_logger() -SUPPORTED_DATA_FORMAT = ['.csv','.xlsx', '.xls','.json','.html','.h5', '.hdf5','.parquet','.feather','.dta'] +SUPPORTED_DATA_FORMAT = [".csv", ".xlsx", ".xls", ".json", ".html", ".h5", ".hdf5", ".parquet", ".feather", ".dta"] + class TableLoader: - ''' + """ Load tables from different files and serve a sql database for database operations - ''' - def __init__(self, files: str, - sql_path:str='sqlite:///mydatabase.db', - verbose=False, **kwargs) -> None: - ''' + """ + + def __init__(self, files: str, sql_path: str = "sqlite:///mydatabase.db", verbose=False, **kwargs) -> None: + """ Args: files: list of files (list[file path, name]) sql_path: how to serve the sql database - **kwargs: keyword type arguments, useful for certain document types - ''' + **kwargs: keyword type arguments, useful for certain document types + """ self.data = {} self.verbose = verbose self.sql_path = sql_path self.kwargs = kwargs self.sql_engine = create_engine(self.sql_path) drop_table(self.sql_engine) - + self.sql_engine = create_engine(self.sql_path) for item in files: path = item[0] @@ -42,68 +43,68 @@ def __init__(self, files: str, raise FileNotFoundError(f"{path} doesn't exists") if not any([path.endswith(i) for i in SUPPORTED_DATA_FORMAT]): raise TypeError(f"{path} not supported. Supported type {SUPPORTED_DATA_FORMAT}") - + logger.info("loading data", verbose=self.verbose) self.load_data(path) logger.info("data loaded", verbose=self.verbose) self.to_sql(path, dataset_name) def load_data(self, path): - ''' + """ Load data and serve the data as sql database. Data must be in pandas format - ''' + """ files = [] # Handle glob expression try: files = glob.glob(path) except Exception as e: logger.error(e) - if len(files)==0: + if len(files) == 0: raise ValueError("Unsupported file/directory format. 
For directories, please use glob expression") - elif len(files)==1: + elif len(files) == 1: path = files[0] else: for file in files: self.load_data(file) - if path.endswith('.csv'): + if path.endswith(".csv"): # Load csv self.data[path] = pd.read_csv(path) - elif path.endswith('.xlsx') or path.endswith('.xls'): + elif path.endswith(".xlsx") or path.endswith(".xls"): # Load excel self.data[path] = pd.read_excel(path) # You can adjust the sheet_name as needed - elif path.endswith('.json'): + elif path.endswith(".json"): # Load json self.data[path] = pd.read_json(path) - elif path.endswith('.html'): + elif path.endswith(".html"): # Load html html_tables = pd.read_html(path) # Choose the desired table from the list of DataFrame objects self.data[path] = html_tables[0] # You may need to adjust this index - elif path.endswith('.h5') or path.endswith('.hdf5'): + elif path.endswith(".h5") or path.endswith(".hdf5"): # Load h5 - self.data[path] = pd.read_hdf(path, key=self.kwargs.get('key', 'data')) # You can adjust the key as needed - elif path.endswith('.parquet'): + self.data[path] = pd.read_hdf(path, key=self.kwargs.get("key", "data")) # You can adjust the key as needed + elif path.endswith(".parquet"): # Load parquet - self.data[path] = pd.read_parquet(path, engine='fastparquet') - elif path.endswith('.feather'): + self.data[path] = pd.read_parquet(path, engine="fastparquet") + elif path.endswith(".feather"): # Load feather self.data[path] = pd.read_feather(path) - elif path.endswith('.dta'): + elif path.endswith(".dta"): # Load dta self.data[path] = pd.read_stata(path) else: raise ValueError("Unsupported file format") - + def to_sql(self, path, table_name): - ''' + """ Serve the data as sql database. - ''' - self.data[path].to_sql(table_name, con=self.sql_engine, if_exists='replace', index=False) + """ + self.data[path].to_sql(table_name, con=self.sql_engine, if_exists="replace", index=False) logger.info(f"Loaded to Sqlite3\nPath: {path}", verbose=self.verbose) return self.sql_path - + def get_sql_path(self): return self.sql_path @@ -113,7 +114,3 @@ def __del__(self): self.sql_engine.dispose() del self.data del self.sql_engine - - - - diff --git a/applications/ColossalQA/colossalqa/local/colossalcloud_llm.py b/applications/ColossalQA/colossalqa/local/colossalcloud_llm.py index 62aead66c54b..14e33820d9c9 100644 --- a/applications/ColossalQA/colossalqa/local/colossalcloud_llm.py +++ b/applications/ColossalQA/colossalqa/local/colossalcloud_llm.py @@ -21,7 +21,7 @@ """ import json -from typing import Any, List, Mapping, Optional +from typing import Any, Mapping import requests from langchain.llms.base import LLM @@ -31,31 +31,31 @@ class ColossalCloudLLM(LLM): """ A custom LLM class that integrates LLMs running on the ColossalCloud Platform - + """ - n: int - gen_config: dict = None + + n: int + gen_config: dict = None auth_config: dict = None - valid_gen_para: list = ['max_new_tokens', 'top_k', - 'top_p', 'temperature', 'repetition_penalty'] + valid_gen_para: list = ["max_new_tokens", "top_k", "top_p", "temperature", "repetition_penalty"] def __init__(self, gen_config=None, **kwargs): """ Args: gen_config: config for generation, max_new_tokens: 50 by default - top_k: (1, vocab_size) + top_k: (1, vocab_size) top_p: (0, 1) if not None - temperature: (0, inf) if not None + temperature: (0, inf) if not None repetition_penalty: (1, inf) if not None """ super(ColossalCloudLLM, self).__init__(**kwargs) - if gen_config is None: - self.gen_config = {"max_new_tokens": 50} - else: + if gen_config is None: + 
self.gen_config = {"max_new_tokens": 50} + else: assert "max_new_tokens" in gen_config, "max_new_tokens is a compulsory key in the gen config" self.gen_config = gen_config - + @property def _identifying_params(self) -> Mapping[str, Any]: """Get the identifying parameters.""" @@ -63,17 +63,17 @@ def _identifying_params(self) -> Mapping[str, Any]: @property def _llm_type(self) -> str: - return 'ColossalCloudLLM' - + return "ColossalCloudLLM" + def set_auth_config(self, **kwargs): url = get_from_dict_or_env(kwargs, "url", "URL") host = get_from_dict_or_env(kwargs, "host", "HOST") - + auth_config = {} - auth_config['endpoint'] = url - auth_config['Host'] = host + auth_config["endpoint"] = url + auth_config["Host"] = host self.auth_config = auth_config - + def _call(self, prompt: str, stop=None, **kwargs: Any) -> str: """ Args: @@ -81,15 +81,17 @@ def _call(self, prompt: str, stop=None, **kwargs: Any) -> str: stop: A list of strings to stop generation when encountered Returns: - The string generated by the model + The string generated by the model """ # Update the generation arguments for key, value in kwargs.items(): if key not in self.valid_gen_para: - raise KeyError(f"Invalid generation parameter: '{key}'. Valid keys are: {', '.join(self.valid_gen_para)}") + raise KeyError( + f"Invalid generation parameter: '{key}'. Valid keys are: {', '.join(self.valid_gen_para)}" + ) if key in self.gen_config: self.gen_config[key] = value - + resp_text = self.text_completion(prompt, self.gen_config, self.auth_config) # TODO: This may cause excessive tokens count if stop is not None: @@ -97,29 +99,19 @@ def _call(self, prompt: str, stop=None, **kwargs: Any) -> str: if stopping_words in resp_text: resp_text = resp_text.split(stopping_words)[0] return resp_text - def text_completion(self, prompt, gen_config, auth_config): # Complusory Parameters - endpoint = auth_config.pop('endpoint') - max_new_tokens = gen_config.pop('max_new_tokens') + endpoint = auth_config.pop("endpoint") + max_new_tokens = gen_config.pop("max_new_tokens") # Optional Parameters - optional_params = ['top_k', 'top_p', 'temperature', 'repetition_penalty'] # Self.optional + optional_params = ["top_k", "top_p", "temperature", "repetition_penalty"] # Self.optional gen_config = {key: gen_config[key] for key in optional_params if key in gen_config} # Define the data payload - data = { - "max_new_tokens": max_new_tokens, - "history": [ - {"instruction": prompt, "response": ""} - ], - **gen_config - } - headers = { - "Content-Type": "application/json", - **auth_config # 'Host', - } + data = {"max_new_tokens": max_new_tokens, "history": [{"instruction": prompt, "response": ""}], **gen_config} + headers = {"Content-Type": "application/json", **auth_config} # 'Host', # Make the POST request response = requests.post(endpoint, headers=headers, data=json.dumps(data)) - response.raise_for_status() # raise error if return code is not 200(success) + response.raise_for_status() # raise error if return code is not 200(success) # Check the response return response.text diff --git a/applications/ColossalQA/colossalqa/local/llm.py b/applications/ColossalQA/colossalqa/local/llm.py index 0aa383e9d0b9..bab702d14b13 100644 --- a/applications/ColossalQA/colossalqa/local/llm.py +++ b/applications/ColossalQA/colossalqa/local/llm.py @@ -193,4 +193,3 @@ def set_host_port(self, host: str = "localhost", port: int = 8077, **kwargs) -> def _identifying_params(self) -> Mapping[str, int]: """Get the identifying parameters.""" return {"n": self.n} - diff --git 
a/applications/ColossalQA/colossalqa/prompt/prompt.py b/applications/ColossalQA/colossalqa/prompt/prompt.py index d62249ba9c51..a8aeded3a4a4 100644 --- a/applications/ColossalQA/colossalqa/prompt/prompt.py +++ b/applications/ColossalQA/colossalqa/prompt/prompt.py @@ -4,7 +4,6 @@ from langchain.prompts.prompt import PromptTemplate - # Below are Chinese retrieval qa prompts _CUSTOM_SUMMARIZER_TEMPLATE_ZH = """请递进式地总结所提供的当前对话,将当前对话的摘要内容添加到先前已有的摘要上,返回一个融合了当前对话的新的摘要。 diff --git a/applications/ColossalQA/colossalqa/retriever.py b/applications/ColossalQA/colossalqa/retriever.py index c891cb613bd6..22a75050f03b 100644 --- a/applications/ColossalQA/colossalqa/retriever.py +++ b/applications/ColossalQA/colossalqa/retriever.py @@ -99,13 +99,7 @@ def add_documents( def clear_documents(self): """Clear all document vectors from database""" for source in self.vector_stores: - index( - [], - self.record_managers[source], - self.vector_stores[source], - cleanup="full", - source_id_key="source" - ) + index([], self.record_managers[source], self.vector_stores[source], cleanup="full", source_id_key="source") self.vector_stores = {} self.sql_index_database = {} self.record_managers = {} diff --git a/applications/ColossalQA/data/data_sample/companies.txt b/applications/ColossalQA/data/data_sample/companies.txt index 05c6148f18a5..4b297e6ff4fe 100644 --- a/applications/ColossalQA/data/data_sample/companies.txt +++ b/applications/ColossalQA/data/data_sample/companies.txt @@ -1,6 +1,6 @@ Overview The Straits Times is the English flagship daily of SPH Media, one of the leading media companies in Asia. Launched on July 15, 1845, its comprehensive coverage of news from home and around the world makes The Straits Times the most-read newspaper in Singapore. Quality news, in-depth analyses, impactful commentaries and breaking stories are packaged to give readers riveting accounts of events in Singapore, the region, and beyond. The most read newspaper in Singapore, both in terms of print and digital, it reaches 1.33 million people every day. The Straits Times'​ key strength is in its world class coverage of news outside Singapore. With 20 bureaus in major cities around the world, The Straits Times correspondents bring world news to readers on a Singapore platter, helping readers to appreciate world events from a Singaporean perspective. Website http://www.straitstimes.com Phone 63196319Phone number is 63196319 Industry Newspaper Publishing Company size 1,001-5,000 employees 183 on LinkedIn Includes members with current employer listed as The Straits Times, including part-time roles. Headquarters Singapore, Singapore Founded 1845 Specialties News and Digital media -About With over 500 properties worldwide, Marriott Hotels has reimagined hospitality to exceed the expectations of business, group, and leisure travelers. +About With over 500 properties worldwide, Marriott Hotels has reimagined hospitality to exceed the expectations of business, group, and leisure travelers. Marriott Hotels, Marriott’s flagship brand of quality-tier, full-service hotels and resorts, provides consistent, dependable and genuinely caring experiences to guests on their terms. Marriott is a brilliant host to guests who effortlessly blend life and work, and who are inspired by how modern travel enhances them both. 
Our hotels offer warm, professional service; sophisticated yet functional guest room design; lobby spaces that facilitate working, dining and socializing; restaurants and bars serving international cuisine prepared simply and from the freshest ingredients; meeting and event spaces and services that are gold standard; and expansive, 24-hour fitness facilities. -Overview AERCO International, Inc. is a recognized leader in delivering cost-effective, condensing commercial boilers, high-efficiency water heaters across a variety of markets including education, lodging, government, office buildings, healthcare, industrial and multifamily housing. AERCO's system design approach provides customer-specific solutions that deliver superior building performance at a lower operating cost while assuring uptime reliability. When AERCO was founded in 1949, it introduced a revolutionary design for an indirect-fired water heater that heated water on demand, and without storage, at a controlled temperature. This innovation became today's standard for water heaters, maximizing the recovery of latent heat energy and significantly increasing operating efficiency. AERCO continued to innovate and in 1988, introduced the first condensing and fully modulating boiler and water heater to the commercial market. The modulating capability of these products, still unsurpassed more than 25 years later, matches the equipment's output to real-time heating demand, ensuring the units draw no more fuel to operate than is absolutely necessary. This not only saves precious energy, but also ensures money doesn't needlessly disappear "up the stack."​ AERCO differentiates itself through a solution-based model, leveraging decades of engineering experience and industry application expertise to understand each customer’s unique needs. By partnering directly with customers and end-users to understand their project-specific requirements, AERCO provides tailored application solutions that are comprised of original product technologies including high efficiency condensing products, compact footprints, high turndown ratios, unique fuel delivery, leading control systems and proprietary design elements that combine to deliver up to 99% efficiency. Website http://www.aerco.com Phone 845-580-8000Phone number is 845-580-8000 Industry Industrial Machinery Manufacturing Company size 51-200 employees 119 on LinkedIn Includes members with current employer listed as AERCO International, Inc., including part-time roles. Headquarters Blauvelt, NY Founded 1949 Specialties Leading manufacturer of condensing boilers, water heating and energy recovery products and The originator of semi-instantaneous water heating +Overview AERCO International, Inc. is a recognized leader in delivering cost-effective, condensing commercial boilers, high-efficiency water heaters across a variety of markets including education, lodging, government, office buildings, healthcare, industrial and multifamily housing. AERCO's system design approach provides customer-specific solutions that deliver superior building performance at a lower operating cost while assuring uptime reliability. When AERCO was founded in 1949, it introduced a revolutionary design for an indirect-fired water heater that heated water on demand, and without storage, at a controlled temperature. This innovation became today's standard for water heaters, maximizing the recovery of latent heat energy and significantly increasing operating efficiency. 
AERCO continued to innovate and in 1988, introduced the first condensing and fully modulating boiler and water heater to the commercial market. The modulating capability of these products, still unsurpassed more than 25 years later, matches the equipment's output to real-time heating demand, ensuring the units draw no more fuel to operate than is absolutely necessary. This not only saves precious energy, but also ensures money doesn't needlessly disappear "up the stack."​ AERCO differentiates itself through a solution-based model, leveraging decades of engineering experience and industry application expertise to understand each customer’s unique needs. By partnering directly with customers and end-users to understand their project-specific requirements, AERCO provides tailored application solutions that are comprised of original product technologies including high efficiency condensing products, compact footprints, high turndown ratios, unique fuel delivery, leading control systems and proprietary design elements that combine to deliver up to 99% efficiency. Website http://www.aerco.com Phone 845-580-8000Phone number is 845-580-8000 Industry Industrial Machinery Manufacturing Company size 51-200 employees 119 on LinkedIn Includes members with current employer listed as AERCO International, Inc., including part-time roles. Headquarters Blauvelt, NY Founded 1949 Specialties Leading manufacturer of condensing boilers, water heating and energy recovery products and The originator of semi-instantaneous water heating Prince PLC: Overview We are a global leader of quality water solutions for residential, industrial, municipal, and commercial settings. Our family of brands offers one of the most varied product lines in the world, with world-class, water-related solutions focused on: • Plumbing & Flow Control • Water Quality & Conditioning • Water Reuse & Drainage • HVAC • Municipal Waterworks Strategic Goals Watts Water is traded on the New York Stock Exchange under the symbol “WTS.” As a public company, growing shareholder value is critical. To that end, we focus on a five-part Global Strategy: Growth, Commercial Excellence, Operational Excellence, “One Watts Water,” and a Talent & Performance Culture. Follow us on all social media platforms @WattsWater Website http://www.watts.com/ Industry Wholesale Building Materials Company size 5,001-10,000 employees 2,248 on LinkedIn Includes members with current employer listed as Watts Water Technologies, including part-time roles. Headquarters North Andover, MA Specialties Plumbing, HVAC, Water Quality, Gas, Conditioning, Waterworks, and Drainage -About Courtyard Hotels is Marriott International’s largest hotel brand, with more than 1,100 hotels in over 50 countries worldwide. So, no matter where passion takes you, you’ll find us there to help you follow it. Proud members of Marriott Bonvoy. \ No newline at end of file +About Courtyard Hotels is Marriott International’s largest hotel brand, with more than 1,100 hotels in over 50 countries worldwide. So, no matter where passion takes you, you’ll find us there to help you follow it. Proud members of Marriott Bonvoy. 
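The ColossalQA data samples in this patch (companies.txt above; companies_zh.txt, csv_organization_100.csv and companies.csv below) are the fixtures consumed by the loader modules reformatted earlier in the diff. As a rough, illustrative sketch only, assuming the TableLoader interface exactly as shown in applications/ColossalQA/colossalqa/data_loader/table_dataloader.py (the relative file path and the table name "organizations" are examples, not part of this patch), one of the CSV samples can be served as a SQLite table like so:

# Illustrative usage sketch, not part of this patch; it relies only on the TableLoader
# interface shown in applications/ColossalQA/colossalqa/data_loader/table_dataloader.py.
from colossalqa.data_loader.table_dataloader import TableLoader

# TableLoader expects a list of [file path, table name] pairs and serves them
# through SQLAlchemy; "organizations" is an illustrative table name.
loader = TableLoader(
    files=[["applications/ColossalQA/data/data_sample/csv_organization_100.csv", "organizations"]],
    sql_path="sqlite:///mydatabase.db",  # default value shown in the diff
    verbose=True,
)
print(loader.get_sql_path())  # SQLAlchemy URL now backing the loaded table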
diff --git a/applications/ColossalQA/data/data_sample/companies_zh.txt b/applications/ColossalQA/data/data_sample/companies_zh.txt index a67a93590ee8..511873bde686 100644 --- a/applications/ColossalQA/data/data_sample/companies_zh.txt +++ b/applications/ColossalQA/data/data_sample/companies_zh.txt @@ -3,4 +3,4 @@ 万豪酒店(Marriott Hotels)是万豪旗下优质、全方位服务酒店和度假村的旗舰品牌,为客人提供始终如一、可靠和真诚关怀的体验。万豪是一个出色的主人,客人可以轻松地将生活和工作融合在一起,并受到现代旅行如何增强两者的启发。我们的酒店提供热情、专业的服务;精致而实用的客房设计;大堂空间,方便工作、餐饮和社交;餐厅和酒吧提供简单的国际美食和最新鲜的食材;会议及活动场地及服务均属黄金标准;还有宽敞的24小时健身设施。 AERCO International, Inc.是公认的领导者,为教育、住宿、政府、办公楼、医疗保健、工业和多户住宅等各种市场提供具有成本效益的冷凝商用锅炉和高效热水器。AERCO的系统设计方法为客户提供特定的解决方案,以较低的运营成本提供卓越的建筑性能,同时确保正常运行时间的可靠性。AERCO成立于1949年,它推出了一种革命性的设计,用于间接燃烧热水器,在控制温度下按需加热水,而无需储存。这一创新成为当今热水器的标准,最大限度地回收潜热能量,显著提高运行效率。AERCO不断创新,并于1988年向商业市场推出了第一台冷凝和全调制锅炉和热水器。这些产品的调制能力,在超过25年后仍然无与伦比,使设备的输出与实时加热需求相匹配,确保机组不会消耗更多的燃料来运行,除非绝对必要。这不仅节省了宝贵的能源,还确保了钱不会不必要地消失在“堆栈”上。AERCO通过基于解决方案的模式脱颖而出,利用数十年的工程经验和行业应用专业知识来了解每个客户的独特需求。通过与客户和最终用户直接合作,了解他们的项目具体要求,AERCO提供量身定制的应用解决方案,这些解决方案由原创产品技术组成,包括高效冷凝产品,紧凑的足迹,高降压比,独特的燃料输送,领先的控制系统和专有设计元素,结合起来可提供高达99%的效率。网址http://www.aerco.com电话845-580- 8000电话号码845-580-8000工业工业机械制造公司规模51-200名员工LinkedIn上包括当前雇主AERCO International, Inc的成员,包括兼职职位。总部成立于1949年,纽约州布劳维尔特,专长:冷凝锅炉,水加热和能源回收产品的领先制造商,半瞬时水加热的鼻祖 Prince PLC:概述Prince PLC是为住宅、工业、市政和商业环境提供优质水解决方案的全球领导者。我们的品牌家族提供世界上最多样化的产品线之一,拥有世界级的水相关解决方案,专注于:•管道和流量控制•水质和调理•水再利用和排水•hvac•市政水务战略目标瓦茨水务在纽约证券交易所上市,代码为“WTS”。作为一家上市公司,股东价值的增长至关重要。为此,我们将重点放在五部分全球战略上:增长、卓越商业、卓越运营、“一瓦茨水”以及人才与绩效文化。在所有社交媒体平台关注我们@WattsWater网站http://www.watts.com/行业批发建材公司规模5,001-10,000名员工领英2,248名包括目前雇主为WattsWater Technologies的成员,包括兼职职位。总部北安多弗,MA专业管道,暖通空调,水质,气体,空调,自来水厂和排水 -万怡酒店是万豪国际最大的酒店品牌,在全球50多个国家拥有1100多家酒店。所以,无论你的激情带你去哪里,你都会发现我们会帮助你追随它。万豪酒店的骄傲会员。 \ No newline at end of file +万怡酒店是万豪国际最大的酒店品牌,在全球50多个国家拥有1100多家酒店。所以,无论你的激情带你去哪里,你都会发现我们会帮助你追随它。万豪酒店的骄傲会员。 diff --git a/applications/ColossalQA/data/data_sample/csv_organization_100.csv b/applications/ColossalQA/data/data_sample/csv_organization_100.csv index dbe97d5fd774..5d88bfb94573 100644 --- a/applications/ColossalQA/data/data_sample/csv_organization_100.csv +++ b/applications/ColossalQA/data/data_sample/csv_organization_100.csv @@ -98,4 +98,4 @@ Index,Organization Id,Company Name,Website,Country,Description,Founded,Industry, 97,BA6Cd9Dae2Efd62,Good Ltd,http://duffy.com/,Anguilla,Reverse-engineered composite moratorium,1971,Consumer Services,4292 98,E7df80C60Abd7f9,Clements-Espinoza,http://www.flowers.net/,Falkland Islands (Malvinas),Progressive modular hub,1991,Broadcast Media,236 99,AFc285dbE2fEd24,Mendez Inc,https://www.burke.net/,Kyrgyz Republic,User-friendly exuding migration,1993,Education Management,339 -100,e9eB5A60Cef8354,Watkins-Kaiser,http://www.herring.com/,Togo,Synergistic background access,2009,Financial Services,2785 \ No newline at end of file +100,e9eB5A60Cef8354,Watkins-Kaiser,http://www.herring.com/,Togo,Synergistic background access,2009,Financial Services,2785 diff --git a/applications/ColossalQA/data/tests/64KB.json b/applications/ColossalQA/data/tests/64KB.json index 99278dc5c79a..41e1aeedf539 100644 --- a/applications/ColossalQA/data/tests/64KB.json +++ b/applications/ColossalQA/data/tests/64KB.json @@ -4,4 +4,4 @@ {"content":"Aliquam sollicitudin ante ligula, eget malesuada nibh efficitur et. Pellentesque massa sem, scelerisque sit amet odio id, cursus tempor urna. Etiam congue dignissim volutpat. 
Vestibulum pharetra libero et velit gravida euismod."} ], "name":"player" -} \ No newline at end of file +} diff --git a/applications/ColossalQA/data/tests/companies.csv b/applications/ColossalQA/data/tests/companies.csv index 93dcac9f39ae..a111992d7fcd 100644 --- a/applications/ColossalQA/data/tests/companies.csv +++ b/applications/ColossalQA/data/tests/companies.csv @@ -1,101 +1,101 @@ -Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees -1,FAB0d41d5b5d22c,Ferrell LLC,https://price.net/,Papua New Guinea,Horizontal empowering knowledgebase,1990,Plastics,3498 -2,6A7EdDEA9FaDC52,"Mckinney, Riley and Day",http://www.hall-buchanan.info/,Finland,User-centric system-worthy leverage,2015,Glass / Ceramics / Concrete,4952 -3,0bFED1ADAE4bcC1,Hester Ltd,http://sullivan-reed.com/,China,Switchable scalable moratorium,1971,Public Safety,5287 -4,2bFC1Be8a4ce42f,Holder-Sellers,https://becker.com/,Turkmenistan,De-engineered systemic artificial intelligence,2004,Automotive,921 -5,9eE8A6a4Eb96C24,Mayer Group,http://www.brewer.com/,Mauritius,Synchronized needs-based challenge,1991,Transportation,7870 -6,cC757116fe1C085,Henry-Thompson,http://morse.net/,Bahamas,Face-to-face well-modulated customer loyalty,1992,Primary / Secondary Education,4914 -7,219233e8aFF1BC3,Hansen-Everett,https://www.kidd.org/,Pakistan,Seamless disintermediate collaboration,2018,Publishing Industry,7832 -8,ccc93DCF81a31CD,Mcintosh-Mora,https://www.brooks.com/,Heard Island and McDonald Islands,Centralized attitude-oriented capability,1970,Import / Export,4389 -9,0B4F93aA06ED03e,Carr Inc,http://ross.com/,Kuwait,Distributed impactful customer loyalty,1996,Plastics,8167 -10,738b5aDe6B1C6A5,Gaines Inc,http://sandoval-hooper.com/,Uzbekistan,Multi-lateral scalable protocol,1997,Outsourcing / Offshoring,9698 -11,AE61b8Ffebbc476,Kidd Group,http://www.lyons.com/,Bouvet Island (Bouvetoya),Proactive foreground paradigm,2001,Primary / Secondary Education,7473 -12,eb3B7D06cCdD609,Crane-Clarke,https://www.sandoval.com/,Denmark,Front-line clear-thinking encryption,2014,Food / Beverages,9011 -13,8D0c29189C9798B,"Keller, Campos and Black",https://www.garner.info/,Liberia,Ameliorated directional emulation,2020,Museums / Institutions,2862 -14,D2c91cc03CA394c,Glover-Pope,http://www.silva.biz/,United Arab Emirates,Persevering contextually-based approach,2013,Medical Practice,9079 -15,C8AC1eaf9C036F4,Pacheco-Spears,https://aguilar.com/,Sweden,Secured logistical synergy,1984,Maritime,769 -16,b5D10A14f7a8AfE,Hodge-Ayers,http://www.archer-elliott.com/,Honduras,Future-proofed radical implementation,1990,Facilities Services,8508 -17,68139b5C4De03B4,"Bowers, Guerra and Krause",http://www.carrillo-nicholson.com/,Uganda,De-engineered transitional strategy,1972,Primary / Secondary Education,6986 -18,5c2EffEfdba2BdF,Mckenzie-Melton,http://montoya-thompson.com/,Hong Kong,Reverse-engineered heuristic alliance,1998,Investment Management / Hedge Fund / Private Equity,4589 -19,ba179F19F7925f5,Branch-Mann,http://www.lozano.com/,Botswana,Adaptive intangible frame,1999,Architecture / Planning,7961 -20,c1Ce9B350BAc66b,Weiss and Sons,https://barrett.com/,Korea,Sharable optimal functionalities,2011,Plastics,5984 -21,8de40AC4e6EaCa4,"Velez, Payne and Coffey",http://burton.com/,Luxembourg,Mandatory coherent synergy,1986,Wholesale,5010 -22,Aad86a4F0385F2d,Harrell LLC,http://www.frey-rosario.com/,Guadeloupe,Reverse-engineered mission-critical moratorium,2018,Construction,2185 -23,22aC3FFd64fD703,"Eaton, Reynolds and 
Vargas",http://www.freeman.biz/,Monaco,Self-enabling multi-tasking process improvement,2014,Luxury Goods / Jewelry,8987 -24,5Ec4C272bCf085c,Robbins-Cummings,http://donaldson-wilkins.com/,Belgium,Organic non-volatile hierarchy,1991,Pharmaceuticals,5038 -25,5fDBeA8BB91a000,Jenkins Inc,http://www.kirk.biz/,South Africa,Front-line systematic help-desk,2002,Insurance,1215 -26,dFfD6a6F9AC2d9C,"Greene, Benjamin and Novak",http://www.kent.net/,Romania,Centralized leadingedge moratorium,2012,Museums / Institutions,4941 -27,4B217cC5a0674C5,"Dickson, Richmond and Clay",http://everett.com/,Czech Republic,Team-oriented tangible complexity,1980,Real Estate / Mortgage,3122 -28,88b1f1cDcf59a37,Prince-David,http://thompson.com/,Christmas Island,Virtual holistic methodology,1970,Banking / Mortgage,1046 -29,f9F7bBCAEeC360F,Ayala LLC,http://www.zhang.com/,Philippines,Open-source zero administration hierarchy,2021,Legal Services,7664 -30,7Cb3AeFcE4Ba31e,Rivas Group,https://hebert.org/,Australia,Open-architected well-modulated capacity,1998,Logistics / Procurement,4155 -31,ccBcC32adcbc530,"Sloan, Mays and Whitehead",http://lawson.com/,Chad,Face-to-face high-level conglomeration,1997,Civil Engineering,365 -32,f5afd686b3d05F5,"Durham, Allen and Barnes",http://chan-stafford.org/,Zimbabwe,Synergistic web-enabled framework,1993,Mechanical or Industrial Engineering,6135 -33,38C6cfC5074Fa5e,Fritz-Franklin,http://www.lambert.com/,Nepal,Automated 4thgeneration website,1972,Hospitality,4516 -34,5Cd7efccCcba38f,Burch-Ewing,http://cline.net/,Taiwan,User-centric 4thgeneration system engine,1981,Venture Capital / VC,7443 -35,9E6Acb51e3F9d6F,"Glass, Barrera and Turner",https://dunlap.com/,Kyrgyz Republic,Multi-channeled 3rdgeneration open system,2020,Utilities,2610 -36,4D4d7E18321eaeC,Pineda-Cox,http://aguilar.org/,Bolivia,Fundamental asynchronous capability,2010,Human Resources / HR,1312 -37,485f5d06B938F2b,"Baker, Mccann and Macdonald",http://www.anderson-barker.com/,Kenya,Cross-group user-facing focus group,2013,Legislative Office,1638 -38,19E3a5Bf6dBDc4F,Cuevas-Moss,https://dodson-castaneda.net/,Guatemala,Extended human-resource intranet,1994,Music,9995 -39,6883A965c7b68F7,Hahn PLC,http://newman.com/,Belarus,Organic logistical leverage,2012,Electrical / Electronic Manufacturing,3715 -40,AC5B7AA74Aa4A2E,"Valentine, Ferguson and Kramer",http://stuart.net/,Jersey,Centralized secondary time-frame,1997,Non - Profit / Volunteering,3585 -41,decab0D5027CA6a,Arroyo Inc,https://www.turner.com/,Grenada,Managed demand-driven website,2006,Writing / Editing,9067 -42,dF084FbBb613eea,Walls LLC,http://www.reese-vasquez.biz/,Cape Verde,Self-enabling fresh-thinking installation,1989,Investment Management / Hedge Fund / Private Equity,1678 -43,A2D89Ab9bCcAd4e,"Mitchell, Warren and Schneider",https://fox.biz/,Trinidad and Tobago,Enhanced intangible time-frame,2021,Capital Markets / Hedge Fund / Private Equity,3816 -44,77aDc905434a49f,Prince PLC,https://www.watts.com/,Sweden,Profit-focused coherent installation,2016,Individual / Family Services,7645 -45,235fdEFE2cfDa5F,Brock-Blackwell,http://www.small.com/,Benin,Secured foreground emulation,1986,Online Publishing,7034 -46,1eD64cFe986BBbE,Walton-Barnett,https://ashley-schaefer.com/,Western Sahara,Right-sized clear-thinking flexibility,2001,Luxury Goods / Jewelry,1746 -47,CbBbFcdd0eaE2cF,Bartlett-Arroyo,https://cruz.com/,Northern Mariana Islands,Realigned didactic function,1976,Civic / Social Organization,3987 -48,49aECbDaE6aBD53,"Wallace, Madden and 
Morris",http://www.blevins-fernandez.biz/,Germany,Persistent real-time customer loyalty,2016,Pharmaceuticals,9443 -49,7b3fe6e7E72bFa4,Berg-Sparks,https://cisneros-love.com/,Canada,Stand-alone static implementation,1974,Arts / Crafts,2073 -50,c6DedA82A8aef7E,Gonzales Ltd,http://bird.com/,Tonga,Managed human-resource policy,1988,Consumer Goods,9069 -51,7D9FBF85cdC3871,Lawson and Sons,https://www.wong.com/,French Southern Territories,Compatible analyzing intranet,2021,Arts / Crafts,3527 -52,7dd18Fb7cB07b65,"Mcguire, Mcconnell and Olsen",https://melton-briggs.com/,Korea,Profound client-server frame,1988,Printing,8445 -53,EF5B55FadccB8Fe,Charles-Phillips,https://bowman.com/,Cote d'Ivoire,Monitored client-server implementation,2012,Mental Health Care,3450 -54,f8D4B99e11fAF5D,Odom Ltd,https://www.humphrey-hess.com/,Cote d'Ivoire,Advanced static process improvement,2012,Management Consulting,1825 -55,e24D21BFd3bF1E5,Richard PLC,https://holden-coleman.net/,Mayotte,Object-based optimizing model,1971,Broadcast Media,4942 -56,B9BdfEB6D3Ca44E,Sampson Ltd,https://blevins.com/,Cayman Islands,Intuitive local adapter,2005,Farming,1418 -57,2a74D6f3D3B268e,"Cherry, Le and Callahan",https://waller-delacruz.biz/,Nigeria,Universal human-resource collaboration,2017,Entertainment / Movie Production,7202 -58,Bf3F3f62c8aBC33,Cherry PLC,https://www.avila.info/,Marshall Islands,Persistent tertiary website,1980,Plastics,8245 -59,aeBe26B80a7a23c,Melton-Nichols,https://kennedy.com/,Palau,User-friendly clear-thinking productivity,2021,Legislative Office,8741 -60,aAeb29ad43886C6,Potter-Walsh,http://thomas-french.org/,Turkey,Optional non-volatile open system,2008,Human Resources / HR,6923 -61,bD1bc6bB6d1FeD3,Freeman-Chen,https://mathis.com/,Timor-Leste,Phased next generation adapter,1973,International Trade / Development,346 -62,EB9f456e8b7022a,Soto Group,https://norris.info/,Vietnam,Enterprise-wide executive installation,1988,Business Supplies / Equipment,9097 -63,Dfef38C51D8DAe3,"Poole, Cruz and Whitney",https://reed.info/,Reunion,Balanced analyzing groupware,1978,Marketing / Advertising / Sales,2992 -64,055ffEfB2Dd95B0,Riley Ltd,http://wiley.com/,Brazil,Optional exuding superstructure,1986,Textiles,9315 -65,cBfe4dbAE1699da,"Erickson, Andrews and Bailey",https://www.hobbs-grant.com/,Eritrea,Vision-oriented secondary project,2014,Consumer Electronics,7829 -66,fdFbecbadcdCdf1,"Wilkinson, Charles and Arroyo",http://hunter-mcfarland.com/,United States Virgin Islands,Assimilated 24/7 archive,1996,Building Materials,602 -67,5DCb8A5a5ca03c0,Floyd Ltd,http://www.whitney.com/,Falkland Islands (Malvinas),Function-based fault-tolerant concept,2017,Public Relations / PR,2911 -68,ce57DCbcFD6d618,Newman-Galloway,https://www.scott.com/,Luxembourg,Enhanced foreground collaboration,1987,Information Technology / IT,3934 -69,5aaD187dc929371,Frazier-Butler,https://www.daugherty-farley.info/,Northern Mariana Islands,Persistent interactive circuit,1972,Outsourcing / Offshoring,5130 -70,902D7Ac8b6d476b,Newton Inc,https://www.richmond-manning.info/,Netherlands Antilles,Fundamental stable info-mediaries,1976,Military Industry,563 -71,32BB9Ff4d939788,Duffy-Levy,https://www.potter.com/,Guernsey,Diverse exuding installation,1982,Wireless,6146 -72,adcB0afbE58bAe3,Wagner LLC,https://decker-esparza.com/,Uruguay,Reactive attitude-oriented toolset,1987,International Affairs,6874 -73,dfcA1c84AdB61Ac,Mccall-Holmes,http://www.dean.com/,Benin,Object-based value-added database,2009,Legal Services,696 -74,208044AC2fe52F3,Massey 
LLC,https://frazier.biz/,Suriname,Configurable zero administration Graphical User Interface,1986,Accounting,5004 -75,f3C365f0c1A0623,Hicks LLC,http://alvarez.biz/,Pakistan,Quality-focused client-server Graphical User Interface,1970,Computer Software / Engineering,8480 -76,ec5Bdd3CBAfaB93,"Cole, Russell and Avery",http://www.blankenship.com/,Mongolia,De-engineered fault-tolerant challenge,2000,Law Enforcement,7012 -77,DDB19Be7eeB56B4,Cummings-Rojas,https://simon-pearson.com/,Svalbard & Jan Mayen Islands,User-centric modular customer loyalty,2012,Financial Services,7529 -78,dd6CA3d0bc3cAfc,"Beasley, Greene and Mahoney",http://www.petersen-lawrence.com/,Togo,Extended content-based methodology,1976,Religious Institutions,869 -79,A0B9d56e61070e3,"Beasley, Sims and Allison",http://burke.info/,Latvia,Secured zero tolerance hub,1972,Facilities Services,6182 -80,cBa7EFe5D05Adaf,Crawford-Rivera,https://black-ramirez.org/,Cuba,Persevering exuding budgetary management,1999,Online Publishing,7805 -81,Ea3f6D52Ec73563,Montes-Hensley,https://krueger.org/,Liechtenstein,Multi-tiered secondary productivity,2009,Printing,8433 -82,bC0CEd48A8000E0,Velazquez-Odom,https://stokes.com/,Djibouti,Streamlined 6thgeneration function,2002,Alternative Dispute Resolution,4044 -83,c89b9b59BC4baa1,Eaton-Morales,https://www.reeves-graham.com/,Micronesia,Customer-focused explicit frame,1990,Capital Markets / Hedge Fund / Private Equity,7013 -84,FEC51bce8421a7b,"Roberson, Pennington and Palmer",http://www.keith-fisher.com/,Cameroon,Adaptive bi-directional hierarchy,1993,Telecommunications,5571 -85,e0E8e27eAc9CAd5,"George, Russo and Guerra",https://drake.com/,Sweden,Centralized non-volatile capability,1989,Military Industry,2880 -86,B97a6CF9bf5983C,Davila Inc,https://mcconnell.info/,Cocos (Keeling) Islands,Profit-focused dedicated frame,2017,Consumer Electronics,2215 -87,a0a6f9b3DbcBEb5,Mays-Preston,http://www.browning-key.com/,Mali,User-centric heuristic focus group,2006,Military Industry,5786 -88,8cC1bDa330a5871,Pineda-Morton,https://www.carr.com/,United States Virgin Islands,Grass-roots methodical info-mediaries,1991,Printing,6168 -89,ED889CB2FE9cbd3,Huang and Sons,https://www.bolton.com/,Eritrea,Re-contextualized dynamic hierarchy,1981,Semiconductors,7484 -90,F4Dc1417BC6cb8f,Gilbert-Simon,https://www.bradford.biz/,Burundi,Grass-roots radical parallelism,1973,Newspapers / Journalism,1927 -91,7ABc3c7ecA03B34,Sampson-Griffith,http://hendricks.org/,Benin,Multi-layered composite paradigm,1972,Textiles,3881 -92,4e0719FBE38e0aB,Miles-Dominguez,http://www.turner.com/,Gibraltar,Organized empowering forecast,1996,Civic / Social Organization,897 -93,dEbDAAeDfaed00A,Rowe and Sons,https://www.simpson.org/,El Salvador,Balanced multimedia knowledgebase,1978,Facilities Services,8172 -94,61BDeCfeFD0cEF5,"Valenzuela, Holmes and Rowland",https://www.dorsey.net/,Taiwan,Persistent tertiary focus group,1999,Transportation,1483 -95,4e91eD25f486110,"Best, Wade and Shepard",https://zimmerman.com/,Zimbabwe,Innovative background definition,1991,Gambling / Casinos,4873 -96,0a0bfFbBbB8eC7c,Holmes Group,https://mcdowell.org/,Ethiopia,Right-sized zero tolerance focus group,1975,Photography,2988 -97,BA6Cd9Dae2Efd62,Good Ltd,http://duffy.com/,Anguilla,Reverse-engineered composite moratorium,1971,Consumer Services,4292 -98,E7df80C60Abd7f9,Clements-Espinoza,http://www.flowers.net/,Falkland Islands (Malvinas),Progressive modular hub,1991,Broadcast Media,236 -99,AFc285dbE2fEd24,Mendez Inc,https://www.burke.net/,Kyrgyz Republic,User-friendly exuding 
migration,1993,Education Management,339 -100,e9eB5A60Cef8354,Watkins-Kaiser,http://www.herring.com/,Togo,Synergistic background access,2009,Financial Services,2785 +Index,Organization Id,Name,Website,Country,Description,Founded,Industry,Number of employees +1,FAB0d41d5b5d22c,Ferrell LLC,https://price.net/,Papua New Guinea,Horizontal empowering knowledgebase,1990,Plastics,3498 +2,6A7EdDEA9FaDC52,"Mckinney, Riley and Day",http://www.hall-buchanan.info/,Finland,User-centric system-worthy leverage,2015,Glass / Ceramics / Concrete,4952 +3,0bFED1ADAE4bcC1,Hester Ltd,http://sullivan-reed.com/,China,Switchable scalable moratorium,1971,Public Safety,5287 +4,2bFC1Be8a4ce42f,Holder-Sellers,https://becker.com/,Turkmenistan,De-engineered systemic artificial intelligence,2004,Automotive,921 +5,9eE8A6a4Eb96C24,Mayer Group,http://www.brewer.com/,Mauritius,Synchronized needs-based challenge,1991,Transportation,7870 +6,cC757116fe1C085,Henry-Thompson,http://morse.net/,Bahamas,Face-to-face well-modulated customer loyalty,1992,Primary / Secondary Education,4914 +7,219233e8aFF1BC3,Hansen-Everett,https://www.kidd.org/,Pakistan,Seamless disintermediate collaboration,2018,Publishing Industry,7832 +8,ccc93DCF81a31CD,Mcintosh-Mora,https://www.brooks.com/,Heard Island and McDonald Islands,Centralized attitude-oriented capability,1970,Import / Export,4389 +9,0B4F93aA06ED03e,Carr Inc,http://ross.com/,Kuwait,Distributed impactful customer loyalty,1996,Plastics,8167 +10,738b5aDe6B1C6A5,Gaines Inc,http://sandoval-hooper.com/,Uzbekistan,Multi-lateral scalable protocol,1997,Outsourcing / Offshoring,9698 +11,AE61b8Ffebbc476,Kidd Group,http://www.lyons.com/,Bouvet Island (Bouvetoya),Proactive foreground paradigm,2001,Primary / Secondary Education,7473 +12,eb3B7D06cCdD609,Crane-Clarke,https://www.sandoval.com/,Denmark,Front-line clear-thinking encryption,2014,Food / Beverages,9011 +13,8D0c29189C9798B,"Keller, Campos and Black",https://www.garner.info/,Liberia,Ameliorated directional emulation,2020,Museums / Institutions,2862 +14,D2c91cc03CA394c,Glover-Pope,http://www.silva.biz/,United Arab Emirates,Persevering contextually-based approach,2013,Medical Practice,9079 +15,C8AC1eaf9C036F4,Pacheco-Spears,https://aguilar.com/,Sweden,Secured logistical synergy,1984,Maritime,769 +16,b5D10A14f7a8AfE,Hodge-Ayers,http://www.archer-elliott.com/,Honduras,Future-proofed radical implementation,1990,Facilities Services,8508 +17,68139b5C4De03B4,"Bowers, Guerra and Krause",http://www.carrillo-nicholson.com/,Uganda,De-engineered transitional strategy,1972,Primary / Secondary Education,6986 +18,5c2EffEfdba2BdF,Mckenzie-Melton,http://montoya-thompson.com/,Hong Kong,Reverse-engineered heuristic alliance,1998,Investment Management / Hedge Fund / Private Equity,4589 +19,ba179F19F7925f5,Branch-Mann,http://www.lozano.com/,Botswana,Adaptive intangible frame,1999,Architecture / Planning,7961 +20,c1Ce9B350BAc66b,Weiss and Sons,https://barrett.com/,Korea,Sharable optimal functionalities,2011,Plastics,5984 +21,8de40AC4e6EaCa4,"Velez, Payne and Coffey",http://burton.com/,Luxembourg,Mandatory coherent synergy,1986,Wholesale,5010 +22,Aad86a4F0385F2d,Harrell LLC,http://www.frey-rosario.com/,Guadeloupe,Reverse-engineered mission-critical moratorium,2018,Construction,2185 +23,22aC3FFd64fD703,"Eaton, Reynolds and Vargas",http://www.freeman.biz/,Monaco,Self-enabling multi-tasking process improvement,2014,Luxury Goods / Jewelry,8987 +24,5Ec4C272bCf085c,Robbins-Cummings,http://donaldson-wilkins.com/,Belgium,Organic non-volatile hierarchy,1991,Pharmaceuticals,5038 
+25,5fDBeA8BB91a000,Jenkins Inc,http://www.kirk.biz/,South Africa,Front-line systematic help-desk,2002,Insurance,1215 +26,dFfD6a6F9AC2d9C,"Greene, Benjamin and Novak",http://www.kent.net/,Romania,Centralized leadingedge moratorium,2012,Museums / Institutions,4941 +27,4B217cC5a0674C5,"Dickson, Richmond and Clay",http://everett.com/,Czech Republic,Team-oriented tangible complexity,1980,Real Estate / Mortgage,3122 +28,88b1f1cDcf59a37,Prince-David,http://thompson.com/,Christmas Island,Virtual holistic methodology,1970,Banking / Mortgage,1046 +29,f9F7bBCAEeC360F,Ayala LLC,http://www.zhang.com/,Philippines,Open-source zero administration hierarchy,2021,Legal Services,7664 +30,7Cb3AeFcE4Ba31e,Rivas Group,https://hebert.org/,Australia,Open-architected well-modulated capacity,1998,Logistics / Procurement,4155 +31,ccBcC32adcbc530,"Sloan, Mays and Whitehead",http://lawson.com/,Chad,Face-to-face high-level conglomeration,1997,Civil Engineering,365 +32,f5afd686b3d05F5,"Durham, Allen and Barnes",http://chan-stafford.org/,Zimbabwe,Synergistic web-enabled framework,1993,Mechanical or Industrial Engineering,6135 +33,38C6cfC5074Fa5e,Fritz-Franklin,http://www.lambert.com/,Nepal,Automated 4thgeneration website,1972,Hospitality,4516 +34,5Cd7efccCcba38f,Burch-Ewing,http://cline.net/,Taiwan,User-centric 4thgeneration system engine,1981,Venture Capital / VC,7443 +35,9E6Acb51e3F9d6F,"Glass, Barrera and Turner",https://dunlap.com/,Kyrgyz Republic,Multi-channeled 3rdgeneration open system,2020,Utilities,2610 +36,4D4d7E18321eaeC,Pineda-Cox,http://aguilar.org/,Bolivia,Fundamental asynchronous capability,2010,Human Resources / HR,1312 +37,485f5d06B938F2b,"Baker, Mccann and Macdonald",http://www.anderson-barker.com/,Kenya,Cross-group user-facing focus group,2013,Legislative Office,1638 +38,19E3a5Bf6dBDc4F,Cuevas-Moss,https://dodson-castaneda.net/,Guatemala,Extended human-resource intranet,1994,Music,9995 +39,6883A965c7b68F7,Hahn PLC,http://newman.com/,Belarus,Organic logistical leverage,2012,Electrical / Electronic Manufacturing,3715 +40,AC5B7AA74Aa4A2E,"Valentine, Ferguson and Kramer",http://stuart.net/,Jersey,Centralized secondary time-frame,1997,Non - Profit / Volunteering,3585 +41,decab0D5027CA6a,Arroyo Inc,https://www.turner.com/,Grenada,Managed demand-driven website,2006,Writing / Editing,9067 +42,dF084FbBb613eea,Walls LLC,http://www.reese-vasquez.biz/,Cape Verde,Self-enabling fresh-thinking installation,1989,Investment Management / Hedge Fund / Private Equity,1678 +43,A2D89Ab9bCcAd4e,"Mitchell, Warren and Schneider",https://fox.biz/,Trinidad and Tobago,Enhanced intangible time-frame,2021,Capital Markets / Hedge Fund / Private Equity,3816 +44,77aDc905434a49f,Prince PLC,https://www.watts.com/,Sweden,Profit-focused coherent installation,2016,Individual / Family Services,7645 +45,235fdEFE2cfDa5F,Brock-Blackwell,http://www.small.com/,Benin,Secured foreground emulation,1986,Online Publishing,7034 +46,1eD64cFe986BBbE,Walton-Barnett,https://ashley-schaefer.com/,Western Sahara,Right-sized clear-thinking flexibility,2001,Luxury Goods / Jewelry,1746 +47,CbBbFcdd0eaE2cF,Bartlett-Arroyo,https://cruz.com/,Northern Mariana Islands,Realigned didactic function,1976,Civic / Social Organization,3987 +48,49aECbDaE6aBD53,"Wallace, Madden and Morris",http://www.blevins-fernandez.biz/,Germany,Persistent real-time customer loyalty,2016,Pharmaceuticals,9443 +49,7b3fe6e7E72bFa4,Berg-Sparks,https://cisneros-love.com/,Canada,Stand-alone static implementation,1974,Arts / Crafts,2073 +50,c6DedA82A8aef7E,Gonzales 
Ltd,http://bird.com/,Tonga,Managed human-resource policy,1988,Consumer Goods,9069 +51,7D9FBF85cdC3871,Lawson and Sons,https://www.wong.com/,French Southern Territories,Compatible analyzing intranet,2021,Arts / Crafts,3527 +52,7dd18Fb7cB07b65,"Mcguire, Mcconnell and Olsen",https://melton-briggs.com/,Korea,Profound client-server frame,1988,Printing,8445 +53,EF5B55FadccB8Fe,Charles-Phillips,https://bowman.com/,Cote d'Ivoire,Monitored client-server implementation,2012,Mental Health Care,3450 +54,f8D4B99e11fAF5D,Odom Ltd,https://www.humphrey-hess.com/,Cote d'Ivoire,Advanced static process improvement,2012,Management Consulting,1825 +55,e24D21BFd3bF1E5,Richard PLC,https://holden-coleman.net/,Mayotte,Object-based optimizing model,1971,Broadcast Media,4942 +56,B9BdfEB6D3Ca44E,Sampson Ltd,https://blevins.com/,Cayman Islands,Intuitive local adapter,2005,Farming,1418 +57,2a74D6f3D3B268e,"Cherry, Le and Callahan",https://waller-delacruz.biz/,Nigeria,Universal human-resource collaboration,2017,Entertainment / Movie Production,7202 +58,Bf3F3f62c8aBC33,Cherry PLC,https://www.avila.info/,Marshall Islands,Persistent tertiary website,1980,Plastics,8245 +59,aeBe26B80a7a23c,Melton-Nichols,https://kennedy.com/,Palau,User-friendly clear-thinking productivity,2021,Legislative Office,8741 +60,aAeb29ad43886C6,Potter-Walsh,http://thomas-french.org/,Turkey,Optional non-volatile open system,2008,Human Resources / HR,6923 +61,bD1bc6bB6d1FeD3,Freeman-Chen,https://mathis.com/,Timor-Leste,Phased next generation adapter,1973,International Trade / Development,346 +62,EB9f456e8b7022a,Soto Group,https://norris.info/,Vietnam,Enterprise-wide executive installation,1988,Business Supplies / Equipment,9097 +63,Dfef38C51D8DAe3,"Poole, Cruz and Whitney",https://reed.info/,Reunion,Balanced analyzing groupware,1978,Marketing / Advertising / Sales,2992 +64,055ffEfB2Dd95B0,Riley Ltd,http://wiley.com/,Brazil,Optional exuding superstructure,1986,Textiles,9315 +65,cBfe4dbAE1699da,"Erickson, Andrews and Bailey",https://www.hobbs-grant.com/,Eritrea,Vision-oriented secondary project,2014,Consumer Electronics,7829 +66,fdFbecbadcdCdf1,"Wilkinson, Charles and Arroyo",http://hunter-mcfarland.com/,United States Virgin Islands,Assimilated 24/7 archive,1996,Building Materials,602 +67,5DCb8A5a5ca03c0,Floyd Ltd,http://www.whitney.com/,Falkland Islands (Malvinas),Function-based fault-tolerant concept,2017,Public Relations / PR,2911 +68,ce57DCbcFD6d618,Newman-Galloway,https://www.scott.com/,Luxembourg,Enhanced foreground collaboration,1987,Information Technology / IT,3934 +69,5aaD187dc929371,Frazier-Butler,https://www.daugherty-farley.info/,Northern Mariana Islands,Persistent interactive circuit,1972,Outsourcing / Offshoring,5130 +70,902D7Ac8b6d476b,Newton Inc,https://www.richmond-manning.info/,Netherlands Antilles,Fundamental stable info-mediaries,1976,Military Industry,563 +71,32BB9Ff4d939788,Duffy-Levy,https://www.potter.com/,Guernsey,Diverse exuding installation,1982,Wireless,6146 +72,adcB0afbE58bAe3,Wagner LLC,https://decker-esparza.com/,Uruguay,Reactive attitude-oriented toolset,1987,International Affairs,6874 +73,dfcA1c84AdB61Ac,Mccall-Holmes,http://www.dean.com/,Benin,Object-based value-added database,2009,Legal Services,696 +74,208044AC2fe52F3,Massey LLC,https://frazier.biz/,Suriname,Configurable zero administration Graphical User Interface,1986,Accounting,5004 +75,f3C365f0c1A0623,Hicks LLC,http://alvarez.biz/,Pakistan,Quality-focused client-server Graphical User Interface,1970,Computer Software / Engineering,8480 +76,ec5Bdd3CBAfaB93,"Cole, 
Russell and Avery",http://www.blankenship.com/,Mongolia,De-engineered fault-tolerant challenge,2000,Law Enforcement,7012 +77,DDB19Be7eeB56B4,Cummings-Rojas,https://simon-pearson.com/,Svalbard & Jan Mayen Islands,User-centric modular customer loyalty,2012,Financial Services,7529 +78,dd6CA3d0bc3cAfc,"Beasley, Greene and Mahoney",http://www.petersen-lawrence.com/,Togo,Extended content-based methodology,1976,Religious Institutions,869 +79,A0B9d56e61070e3,"Beasley, Sims and Allison",http://burke.info/,Latvia,Secured zero tolerance hub,1972,Facilities Services,6182 +80,cBa7EFe5D05Adaf,Crawford-Rivera,https://black-ramirez.org/,Cuba,Persevering exuding budgetary management,1999,Online Publishing,7805 +81,Ea3f6D52Ec73563,Montes-Hensley,https://krueger.org/,Liechtenstein,Multi-tiered secondary productivity,2009,Printing,8433 +82,bC0CEd48A8000E0,Velazquez-Odom,https://stokes.com/,Djibouti,Streamlined 6thgeneration function,2002,Alternative Dispute Resolution,4044 +83,c89b9b59BC4baa1,Eaton-Morales,https://www.reeves-graham.com/,Micronesia,Customer-focused explicit frame,1990,Capital Markets / Hedge Fund / Private Equity,7013 +84,FEC51bce8421a7b,"Roberson, Pennington and Palmer",http://www.keith-fisher.com/,Cameroon,Adaptive bi-directional hierarchy,1993,Telecommunications,5571 +85,e0E8e27eAc9CAd5,"George, Russo and Guerra",https://drake.com/,Sweden,Centralized non-volatile capability,1989,Military Industry,2880 +86,B97a6CF9bf5983C,Davila Inc,https://mcconnell.info/,Cocos (Keeling) Islands,Profit-focused dedicated frame,2017,Consumer Electronics,2215 +87,a0a6f9b3DbcBEb5,Mays-Preston,http://www.browning-key.com/,Mali,User-centric heuristic focus group,2006,Military Industry,5786 +88,8cC1bDa330a5871,Pineda-Morton,https://www.carr.com/,United States Virgin Islands,Grass-roots methodical info-mediaries,1991,Printing,6168 +89,ED889CB2FE9cbd3,Huang and Sons,https://www.bolton.com/,Eritrea,Re-contextualized dynamic hierarchy,1981,Semiconductors,7484 +90,F4Dc1417BC6cb8f,Gilbert-Simon,https://www.bradford.biz/,Burundi,Grass-roots radical parallelism,1973,Newspapers / Journalism,1927 +91,7ABc3c7ecA03B34,Sampson-Griffith,http://hendricks.org/,Benin,Multi-layered composite paradigm,1972,Textiles,3881 +92,4e0719FBE38e0aB,Miles-Dominguez,http://www.turner.com/,Gibraltar,Organized empowering forecast,1996,Civic / Social Organization,897 +93,dEbDAAeDfaed00A,Rowe and Sons,https://www.simpson.org/,El Salvador,Balanced multimedia knowledgebase,1978,Facilities Services,8172 +94,61BDeCfeFD0cEF5,"Valenzuela, Holmes and Rowland",https://www.dorsey.net/,Taiwan,Persistent tertiary focus group,1999,Transportation,1483 +95,4e91eD25f486110,"Best, Wade and Shepard",https://zimmerman.com/,Zimbabwe,Innovative background definition,1991,Gambling / Casinos,4873 +96,0a0bfFbBbB8eC7c,Holmes Group,https://mcdowell.org/,Ethiopia,Right-sized zero tolerance focus group,1975,Photography,2988 +97,BA6Cd9Dae2Efd62,Good Ltd,http://duffy.com/,Anguilla,Reverse-engineered composite moratorium,1971,Consumer Services,4292 +98,E7df80C60Abd7f9,Clements-Espinoza,http://www.flowers.net/,Falkland Islands (Malvinas),Progressive modular hub,1991,Broadcast Media,236 +99,AFc285dbE2fEd24,Mendez Inc,https://www.burke.net/,Kyrgyz Republic,User-friendly exuding migration,1993,Education Management,339 +100,e9eB5A60Cef8354,Watkins-Kaiser,http://www.herring.com/,Togo,Synergistic background access,2009,Financial Services,2785 diff --git a/applications/ColossalQA/data/tests/test.html b/applications/ColossalQA/data/tests/test.html index 5ad21421d827..6152ffe296fc 
100644 --- a/applications/ColossalQA/data/tests/test.html +++ b/applications/ColossalQA/data/tests/test.html @@ -1,7 +1,7 @@ - + @@ -16,18 +16,18 @@ logging — Logging facility for Python — Python 3.11.5 documentation - + - + - + - + @@ -36,11 +36,11 @@ - - - - + + + +
\ No newline at end of file +
diff --git a/applications/ColossalQA/data/tests/test.md b/applications/ColossalQA/data/tests/test.md index 20d3c612fef5..b6e94792c83d 100644 --- a/applications/ColossalQA/data/tests/test.md +++ b/applications/ColossalQA/data/tests/test.md @@ -34,9 +34,9 @@ python api_server.py --host localhost --port $PORT_NUMBER --model $PATH_TO_MODEL ### Collect your data For ChatGPT based Agent we support document retrieval and simple sql search. -If you want to run the demo locally, we provided document retrieval based conversation system built upon langchain. It accept a wide range of documents. +If you want to run the demo locally, we provided document retrieval based conversation system built upon langchain. It accept a wide range of documents. -Read comments under ./colossalqa/data_loader for more detail +Read comments under ./colossalqa/data_loader for more detail ### Serving Currently use vllm will replace with colossal inference when ready. Please refer class VllmLLM. diff --git a/applications/ColossalQA/data/tests/test.txt b/applications/ColossalQA/data/tests/test.txt index ff5bf2dc7742..e608c772e06e 100644 --- a/applications/ColossalQA/data/tests/test.txt +++ b/applications/ColossalQA/data/tests/test.txt @@ -1,38 +1,38 @@ -Your Name -Lorem ipsum dolor sit amet, consectetuer adipiscing elit - 123 Your Street -Your City, ST 12345 -(123) 456-7890 -no_reply@example.com - EXPERIENCE -Company, Location — Job Title -MONTH 20XX - PRESENT -Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh. -Company, Location — Job Title -MONTH 20XX - MONTH 20XX -Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh. -Company, Location — Job Title -MONTH 20XX - MONTH 20XX -Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh. -EDUCATION -School Name, Location — Degree -MONTH 20XX - MONTH 20XX -Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore. -School Name, Location — Degree -MONTH 20XX - MONTH 20XX -Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam. -PROJECTS -Project Name — Detail -Lorem ipsum dolor sit amet, consectetuer adipiscing elit. - SKILLS -* Lorem ipsum dolor sit amet. -* Consectetuer adipiscing elit. -* Sed diam nonummy nibh euismod tincidunt. -* L​​​‌​aoreet dolore magna aliquam erat volutpat. -AWARDS -Lorem ipsum dolor sit amet Consectetuer adipiscing elit, Sed diam nonummy -Nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. -Lorem ipsum dolor sit amet Consectetuer adipiscing elit, Sed diam nonummy -Nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. -LANGUAGES -Lorem ipsum, Dolor sit amet, Consectetuer \ No newline at end of file +Your Name +Lorem ipsum dolor sit amet, consectetuer adipiscing elit + 123 Your Street +Your City, ST 12345 +(123) 456-7890 +no_reply@example.com + EXPERIENCE +Company, Location — Job Title +MONTH 20XX - PRESENT +Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh. +Company, Location — Job Title +MONTH 20XX - MONTH 20XX +Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh. +Company, Location — Job Title +MONTH 20XX - MONTH 20XX +Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh. +EDUCATION +School Name, Location — Degree +MONTH 20XX - MONTH 20XX +Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore. 
+School Name, Location — Degree +MONTH 20XX - MONTH 20XX +Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam. +PROJECTS +Project Name — Detail +Lorem ipsum dolor sit amet, consectetuer adipiscing elit. + SKILLS +* Lorem ipsum dolor sit amet. +* Consectetuer adipiscing elit. +* Sed diam nonummy nibh euismod tincidunt. +* L​​​‌​aoreet dolore magna aliquam erat volutpat. +AWARDS +Lorem ipsum dolor sit amet Consectetuer adipiscing elit, Sed diam nonummy +Nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. +Lorem ipsum dolor sit amet Consectetuer adipiscing elit, Sed diam nonummy +Nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. +LANGUAGES +Lorem ipsum, Dolor sit amet, Consectetuer diff --git a/applications/ColossalQA/examples/retrieval_conversation_universal.py b/applications/ColossalQA/examples/retrieval_conversation_universal.py index 5d13a63c3fad..8999fbabd2ed 100644 --- a/applications/ColossalQA/examples/retrieval_conversation_universal.py +++ b/applications/ColossalQA/examples/retrieval_conversation_universal.py @@ -1,22 +1,27 @@ import argparse + from colossalqa.retrieval_conversation_universal import UniversalRetrievalConversation -if __name__ == '__main__': +if __name__ == "__main__": # Parse arguments parser = argparse.ArgumentParser() - parser.add_argument('--en_model_path', type=str, default=None) - parser.add_argument('--zh_model_path', type=str, default=None) - parser.add_argument('--zh_model_name', type=str, default=None) - parser.add_argument('--en_model_name', type=str, default=None) - parser.add_argument('--sql_file_path', type=str, default=None, help='path to the a empty folder for storing sql files for indexing') + parser.add_argument("--en_model_path", type=str, default=None) + parser.add_argument("--zh_model_path", type=str, default=None) + parser.add_argument("--zh_model_name", type=str, default=None) + parser.add_argument("--en_model_name", type=str, default=None) + parser.add_argument( + "--sql_file_path", type=str, default=None, help="path to the a empty folder for storing sql files for indexing" + ) args = parser.parse_args() - + # Will ask for documents path in running time - session = UniversalRetrievalConversation(files_en=None, - files_zh=None, - zh_model_path=args.zh_model_path, en_model_path=args.en_model_path, - zh_model_name=args.zh_model_name, en_model_name=args.en_model_name, - sql_file_path=args.sql_file_path - ) + session = UniversalRetrievalConversation( + files_en=None, + files_zh=None, + zh_model_path=args.zh_model_path, + en_model_path=args.en_model_path, + zh_model_name=args.zh_model_name, + en_model_name=args.en_model_name, + sql_file_path=args.sql_file_path, + ) session.start_test_session() - \ No newline at end of file diff --git a/applications/ColossalQA/examples/webui_demo/RAG_ChatBot.py b/applications/ColossalQA/examples/webui_demo/RAG_ChatBot.py index 526328dda11b..47897b538ee2 100644 --- a/applications/ColossalQA/examples/webui_demo/RAG_ChatBot.py +++ b/applications/ColossalQA/examples/webui_demo/RAG_ChatBot.py @@ -5,13 +5,7 @@ from colossalqa.data_loader.document_loader import DocumentLoader from colossalqa.memory import ConversationBufferWithSummary from colossalqa.mylogging import get_logger -from colossalqa.prompt.prompt import ( - PROMPT_DISAMBIGUATE_ZH, - PROMPT_RETRIEVAL_QA_ZH, - SUMMARY_PROMPT_ZH, - ZH_RETRIEVAL_QA_REJECTION_ANSWER, - ZH_RETRIEVAL_QA_TRIGGER_KEYWORDS, -) +from colossalqa.prompt.prompt import ZH_RETRIEVAL_QA_REJECTION_ANSWER, ZH_RETRIEVAL_QA_TRIGGER_KEYWORDS from 
colossalqa.retriever import CustomRetriever from langchain import LLMChain from langchain.embeddings import HuggingFaceEmbeddings @@ -116,13 +110,13 @@ def split_docs_and_add_to_mem(self, **kwargs): def split_docs(self, documents): doc_splits = self.text_splitter.split_documents(documents) return doc_splits - + def clear_docs(self, **kwargs): self.documents = [] self.docs_names = [] self.info_retriever.clear_documents() self.memory.initiate_document_retrieval_chain(self.llm, kwargs["gen_qa_prompt"], self.info_retriever) - + def reset_config(self, rag_config): self.rag_config = rag_config self.set_embed_model(**self.rag_config["embed"]) diff --git a/applications/ColossalQA/examples/webui_demo/README.md b/applications/ColossalQA/examples/webui_demo/README.md index 3e1a8adf53f3..61071a1c8065 100644 --- a/applications/ColossalQA/examples/webui_demo/README.md +++ b/applications/ColossalQA/examples/webui_demo/README.md @@ -115,4 +115,4 @@ python webui.py --http_host "your-backend-api-host" --http_port "your-backend-ap After launching the script, you can upload files and engage with the chatbot through your web browser. -![ColossalQA Demo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/colossalqa/new_ui.png) \ No newline at end of file +![ColossalQA Demo](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/colossalqa/new_ui.png) diff --git a/applications/ColossalQA/examples/webui_demo/config.py b/applications/ColossalQA/examples/webui_demo/config.py index ef90fab62589..5bdc08a09e50 100644 --- a/applications/ColossalQA/examples/webui_demo/config.py +++ b/applications/ColossalQA/examples/webui_demo/config.py @@ -1,58 +1,30 @@ -from colossalqa.prompt.prompt import ( - PROMPT_DISAMBIGUATE_ZH, - PROMPT_RETRIEVAL_QA_ZH, - SUMMARY_PROMPT_ZH, - ZH_RETRIEVAL_QA_REJECTION_ANSWER, - ZH_RETRIEVAL_QA_TRIGGER_KEYWORDS, -) +from colossalqa.prompt.prompt import PROMPT_DISAMBIGUATE_ZH, PROMPT_RETRIEVAL_QA_ZH, SUMMARY_PROMPT_ZH from colossalqa.text_splitter import ChineseTextSplitter ALL_CONFIG = { "embed": { "embed_name": "m3e", # embedding model name "embed_model_name_or_path": "moka-ai/m3e-base", # path to embedding model, could be a local path or a huggingface path - "embed_model_device": { - "device": "cpu" - } + "embed_model_device": {"device": "cpu"}, }, "model": { "mode": "api", # "local" for loading models, "api" for using model api "model_name": "chatgpt_api", # local model name, "chatgpt_api" or "pangu_api" - "model_path": "", # path to the model, could be a local path or a huggingface path. don't need if using an api - "device": { - "device": "cuda" - } - }, - "splitter": { - "name": ChineseTextSplitter - }, - "retrieval": { - "retri_top_k": 3, - "retri_kb_file_path": "./", # path to store database files - "verbose": True + "model_path": "", # path to the model, could be a local path or a huggingface path. 
don't need if using an api + "device": {"device": "cuda"}, }, + "splitter": {"name": ChineseTextSplitter}, + "retrieval": {"retri_top_k": 3, "retri_kb_file_path": "./", "verbose": True}, # path to store database files "chain": { "mem_summary_prompt": SUMMARY_PROMPT_ZH, # summary prompt template "mem_human_prefix": "用户", "mem_ai_prefix": "Assistant", "mem_max_tokens": 2000, - "mem_llm_kwargs": { - "max_new_tokens": 50, - "temperature": 1, - "do_sample": True - }, + "mem_llm_kwargs": {"max_new_tokens": 50, "temperature": 1, "do_sample": True}, "disambig_prompt": PROMPT_DISAMBIGUATE_ZH, # disambiguate prompt template - "disambig_llm_kwargs": { - "max_new_tokens": 30, - "temperature": 1, - "do_sample": True - }, - "gen_llm_kwargs": { - "max_new_tokens": 100, - "temperature": 1, - "do_sample": True - }, + "disambig_llm_kwargs": {"max_new_tokens": 30, "temperature": 1, "do_sample": True}, + "gen_llm_kwargs": {"max_new_tokens": 100, "temperature": 1, "do_sample": True}, "gen_qa_prompt": PROMPT_RETRIEVAL_QA_ZH, # generation prompt template - "verbose": True - } -} \ No newline at end of file + "verbose": True, + }, +} diff --git a/applications/ColossalQA/examples/webui_demo/server.py b/applications/ColossalQA/examples/webui_demo/server.py index 3b0f82845c87..1f699421d34c 100644 --- a/applications/ColossalQA/examples/webui_demo/server.py +++ b/applications/ColossalQA/examples/webui_demo/server.py @@ -1,27 +1,18 @@ import argparse -import os from typing import List, Union - +import config +import uvicorn from colossalqa.local.llm import ColossalAPI, ColossalLLM -from colossalqa.data_loader.document_loader import DocumentLoader from colossalqa.mylogging import get_logger -from colossalqa.retrieval_conversation_zh import ChineseRetrievalConversation -from colossalqa.retriever import CustomRetriever -from enum import Enum from fastapi import FastAPI, Request -from langchain.embeddings import HuggingFaceEmbeddings -from langchain.text_splitter import RecursiveCharacterTextSplitter -from pydantic import BaseModel, Field -import uvicorn - -import config +from pydantic import BaseModel from RAG_ChatBot import RAG_ChatBot from utils import DocAction - logger = get_logger() + def parseArgs(): parser = argparse.ArgumentParser() parser.add_argument("--http_host", default="0.0.0.0") @@ -36,6 +27,7 @@ class DocUpdateReq(BaseModel): doc_files: Union[List[str], str, None] = None action: DocAction = DocAction.ADD + class GenerationTaskReq(BaseModel): user_input: str @@ -45,7 +37,7 @@ def update_docs(data: DocUpdateReq, request: Request): if data.action == "add": if isinstance(data.doc_files, str): data.doc_files = [data.doc_files] - chatbot.load_doc_from_files(files = data.doc_files) + chatbot.load_doc_from_files(files=data.doc_files) all_docs = "" for doc in chatbot.docs_names: all_docs += f"\t{doc}\n\n" @@ -79,17 +71,18 @@ def generate(data: GenerationTaskReq, request: Request): elif all_config["model"]["mode"] == "api": if model_name == "pangu_api": from colossalqa.local.pangu_llm import Pangu - + gen_config = { "user": "User", "max_tokens": all_config["chain"]["disambig_llm_kwargs"]["max_new_tokens"], "temperature": all_config["chain"]["disambig_llm_kwargs"]["temperature"], - "n": 1 # the number of responses generated + "n": 1, # the number of responses generated } llm = Pangu(gen_config=gen_config) llm.set_auth_config() # verify user's auth info here elif model_name == "chatgpt_api": from langchain.llms import OpenAI + llm = OpenAI() else: raise ValueError("Unsupported mode.") diff --git 
a/applications/ColossalQA/examples/webui_demo/webui.py b/applications/ColossalQA/examples/webui_demo/webui.py index cd3b5fd5da4b..1e34330615b5 100644 --- a/applications/ColossalQA/examples/webui_demo/webui.py +++ b/applications/ColossalQA/examples/webui_demo/webui.py @@ -1,24 +1,26 @@ import argparse import json import os -import requests import gradio as gr - +import requests from utils import DocAction + def parseArgs(): parser = argparse.ArgumentParser() parser.add_argument("--http_host", default="0.0.0.0") parser.add_argument("--http_port", type=int, default=13666) return parser.parse_args() + def get_response(data, url): headers = {"Content-type": "application/json"} response = requests.post(url, json=data, headers=headers) response = json.loads(response.content) return response + def add_text(history, text): history = history + [(text, None)] return history, gr.update(value=None, interactive=True) @@ -28,35 +30,28 @@ def add_file(history, files): files_string = "\n".join([os.path.basename(file.name) for file in files]) doc_files = [file.name for file in files] - data = { - "doc_files": doc_files, - "action": DocAction.ADD - } + data = {"doc_files": doc_files, "action": DocAction.ADD} response = get_response(data, update_url)["response"] history = history + [(files_string, response)] return history -def bot(history): - data = { - "user_input": history[-1][0].strip() - } + +def bot(history): + data = {"user_input": history[-1][0].strip()} response = get_response(data, gen_url) if response["error"] != "": raise gr.Error(response["error"]) - + history[-1][1] = response["response"] yield history def restart(chatbot, txt): # Reset the conversation state and clear the chat history - data = { - "doc_files": "", - "action": DocAction.CLEAR - } - response = get_response(data, update_url) - + data = {"doc_files": "", "action": DocAction.CLEAR} + get_response(data, update_url) + return gr.update(value=None), gr.update(value=None, interactive=True) @@ -97,7 +92,7 @@ def restart(chatbot, txt): txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(bot, chatbot, chatbot) # Clear the original textbox - txt_msg.then(lambda: gr.update(value=None, interactive=True), None, [txt], queue=False) + txt_msg.then(lambda: gr.update(value=None, interactive=True), None, [txt], queue=False) # Click Upload Button: 1. upload files 2. send config to backend, initalize model 3. 
get response "conversation_ready" = True/False file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False) diff --git a/applications/ColossalQA/pytest.ini b/applications/ColossalQA/pytest.ini index 9e84349f2285..8bd9250769e3 100644 --- a/applications/ColossalQA/pytest.ini +++ b/applications/ColossalQA/pytest.ini @@ -1,4 +1,4 @@ [pytest] markers = dist: tests which are run in a multi-GPU or multi-machine environment (at least 4 GPUs) - largedist: tests which are run in a multi-GPU or multi-machine environment (at least 8 GPUs) \ No newline at end of file + largedist: tests which are run in a multi-GPU or multi-machine environment (at least 8 GPUs) diff --git a/applications/ColossalQA/tests/test_document_loader.py b/applications/ColossalQA/tests/test_document_loader.py index 163b0d2cca22..96c74f128782 100644 --- a/applications/ColossalQA/tests/test_document_loader.py +++ b/applications/ColossalQA/tests/test_document_loader.py @@ -1,21 +1,21 @@ import os + from colossalqa.data_loader.document_loader import DocumentLoader def test_add_document(): - PATH = os.environ.get('TEST_DOCUMENT_LOADER_DATA_PATH') - files = [[PATH, 'all data']] + PATH = os.environ.get("TEST_DOCUMENT_LOADER_DATA_PATH") + files = [[PATH, "all data"]] document_loader = DocumentLoader(files) documents = document_loader.all_data all_files = [] for doc in documents: - assert isinstance(doc.page_content, str)==True - if doc.metadata['source'] not in all_files: - all_files.append(doc.metadata['source']) + assert isinstance(doc.page_content, str) == True + if doc.metadata["source"] not in all_files: + all_files.append(doc.metadata["source"]) print(all_files) assert len(all_files) == 6 -if __name__=='__main__': +if __name__ == "__main__": test_add_document() - diff --git a/applications/ColossalQA/tests/test_retrieval_qa.py b/applications/ColossalQA/tests/test_retrieval_qa.py index 76867b562e8f..8ba84cc7309f 100644 --- a/applications/ColossalQA/tests/test_retrieval_qa.py +++ b/applications/ColossalQA/tests/test_retrieval_qa.py @@ -4,56 +4,44 @@ def test_en_retrievalQA(): - data_path_en = os.environ.get('TEST_DATA_PATH_EN') - data_path_zh = os.environ.get('TEST_DATA_PATH_ZH') - en_model_path = os.environ.get('EN_MODEL_PATH') - zh_model_path = os.environ.get('ZH_MODEL_PATH') - zh_model_name = os.environ.get('ZH_MODEL_NAME') - en_model_name = os.environ.get('EN_MODEL_NAME') - sql_file_path = os.environ.get('SQL_FILE_PATH') - qa_session = UniversalRetrievalConversation(files_en=[{ - 'data_path': data_path_en, - 'name': 'company information', - 'separator': '\n' - }], - files_zh=[{ - 'data_path': data_path_zh, - 'name': 'company information', - 'separator': '\n' - }], - zh_model_path=zh_model_path, - en_model_path=en_model_path, - zh_model_name=zh_model_name, - en_model_name=en_model_name, - sql_file_path=sql_file_path) - ans = qa_session.run("which company runs business in hotel industry?", which_language='en') + data_path_en = os.environ.get("TEST_DATA_PATH_EN") + data_path_zh = os.environ.get("TEST_DATA_PATH_ZH") + en_model_path = os.environ.get("EN_MODEL_PATH") + zh_model_path = os.environ.get("ZH_MODEL_PATH") + zh_model_name = os.environ.get("ZH_MODEL_NAME") + en_model_name = os.environ.get("EN_MODEL_NAME") + sql_file_path = os.environ.get("SQL_FILE_PATH") + qa_session = UniversalRetrievalConversation( + files_en=[{"data_path": data_path_en, "name": "company information", "separator": "\n"}], + files_zh=[{"data_path": data_path_zh, "name": "company information", "separator": "\n"}], + zh_model_path=zh_model_path, + 
en_model_path=en_model_path, + zh_model_name=zh_model_name, + en_model_name=en_model_name, + sql_file_path=sql_file_path, + ) + ans = qa_session.run("which company runs business in hotel industry?", which_language="en") print(ans) def test_zh_retrievalQA(): - data_path_en = os.environ.get('TEST_DATA_PATH_EN') - data_path_zh = os.environ.get('TEST_DATA_PATH_ZH') - en_model_path = os.environ.get('EN_MODEL_PATH') - zh_model_path = os.environ.get('ZH_MODEL_PATH') - zh_model_name = os.environ.get('ZH_MODEL_NAME') - en_model_name = os.environ.get('EN_MODEL_NAME') - sql_file_path = os.environ.get('SQL_FILE_PATH') - qa_session = UniversalRetrievalConversation(files_en=[{ - 'data_path': data_path_en, - 'name': 'company information', - 'separator': '\n' - }], - files_zh=[{ - 'data_path': data_path_zh, - 'name': 'company information', - 'separator': '\n' - }], - zh_model_path=zh_model_path, - en_model_path=en_model_path, - zh_model_name=zh_model_name, - en_model_name=en_model_name, - sql_file_path=sql_file_path) - ans = qa_session.run("哪家公司在经营酒店业务?", which_language='zh') + data_path_en = os.environ.get("TEST_DATA_PATH_EN") + data_path_zh = os.environ.get("TEST_DATA_PATH_ZH") + en_model_path = os.environ.get("EN_MODEL_PATH") + zh_model_path = os.environ.get("ZH_MODEL_PATH") + zh_model_name = os.environ.get("ZH_MODEL_NAME") + en_model_name = os.environ.get("EN_MODEL_NAME") + sql_file_path = os.environ.get("SQL_FILE_PATH") + qa_session = UniversalRetrievalConversation( + files_en=[{"data_path": data_path_en, "name": "company information", "separator": "\n"}], + files_zh=[{"data_path": data_path_zh, "name": "company information", "separator": "\n"}], + zh_model_path=zh_model_path, + en_model_path=en_model_path, + zh_model_name=zh_model_name, + en_model_name=en_model_name, + sql_file_path=sql_file_path, + ) + ans = qa_session.run("哪家公司在经营酒店业务?", which_language="zh") print(ans) diff --git a/applications/ColossalQA/version.txt b/applications/ColossalQA/version.txt index 8a9ecc2ea99d..8acdd82b765e 100644 --- a/applications/ColossalQA/version.txt +++ b/applications/ColossalQA/version.txt @@ -1 +1 @@ -0.0.1 \ No newline at end of file +0.0.1 diff --git a/colossalai/__init__.py b/colossalai/__init__.py index 6b7f5d055207..beef0561ccfc 100644 --- a/colossalai/__init__.py +++ b/colossalai/__init__.py @@ -1,5 +1,5 @@ -from .initialize import launch, launch_from_openmpi, launch_from_slurm, launch_from_torch from . 
import accelerator +from .initialize import launch, launch_from_openmpi, launch_from_slurm, launch_from_torch try: # .version will be created by setup.py diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py index ae372dd034e0..36e36ffd097b 100644 --- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py @@ -1,4 +1,5 @@ import random +import warnings from types import MethodType from typing import Callable, Optional, OrderedDict, Tuple @@ -22,14 +23,14 @@ ) from colossalai.cluster import ProcessGroupMesh from colossalai.interface import ModelWrapper, OptimizerWrapper -from colossalai.moe import MOE_MANAGER, MoECheckpointIO +from colossalai.moe import MoECheckpointIO from colossalai.pipeline.schedule import OneForwardOneBackwardSchedule from colossalai.pipeline.stage_manager import PipelineStageManager from colossalai.shardformer import ShardConfig from colossalai.shardformer.policies.base_policy import Policy from colossalai.zero.low_level import LowLevelZeroOptimizer -PP_AXIS, DP_AXIS, TP_AXIS = 0, 1, 2 +PP_AXIS, DP_AXIS, EP_AXIS, TP_AXIS = 0, 1, 2, -1 class HybridParallelZeroOptimizer(LowLevelZeroOptimizer): @@ -107,8 +108,8 @@ class MoeHybridParallelPlugin(HybridParallelPlugin): >>> model, optimizer, criterion, train_dataloader, _ = booster.boost(model, optimizer, criterion, train_dataloader) Args: - tp_size (int): The size of tensor parallelism. Tensor parallelism will not be used when tp_size is set to 1. pp_size (int): The number of pipeline stages in pipeline parallelism. Pipeline parallelism will not be used when pp_size is set to 1. + tp_size (int): The size of tensor parallelism. Tensor parallelism will not be used when tp_size is set to 1. precision (str, optional): Specifies the precision of parameters during training. Auto-mixied precision will be used when this argument is set to 'fp16' or 'bf16', otherwise model is trained with 'fp32'. Defaults to 'fp16'. @@ -144,14 +145,14 @@ class MoeHybridParallelPlugin(HybridParallelPlugin): cpu_offload (bool, optional): Whether to open cpu_offload when using ZeRO. Defaults to False. communication_dtype (torch.dtype, optional): Communication dtype when using ZeRO. If not specified, the dtype of param will be used. Defaults to None. overlap_communication (bool, optional): Whether to overlap communication and computation when using ZeRO. Defaults to True. + use_ep_inside (bool, Optional): Whether to use ep inside dp (intra-node) for moe params. 
""" def __init__( self, - tp_size: int, pp_size: int, ep_size: int, - extra_dp_size: int = 1, + tp_size: int = 1, precision: str = "fp16", zero_stage: int = 0, enable_all_optimization: bool = False, @@ -184,32 +185,25 @@ def __init__( custom_policy: Policy = None, checkpoint_io: Optional[MoECheckpointIO] = None, ) -> None: + global DP_AXIS, EP_AXIS + world_size = dist.get_world_size() + assert tp_size == 1, "Tensor parallel is not supported in MoE yet" assert ( - dist.get_world_size() % (tp_size * pp_size) == 0 - ), f"world size {dist.get_world_size()} is not divisible by tp_size {tp_size} * pp_size {pp_size}" + world_size % (tp_size * pp_size) == 0 + ), f"world size {world_size} is not divisible by tp_size {tp_size} * pp_size {pp_size}" if enable_sequence_parallelism: assert tp_size > 1, "Sequence parallelism must be enabled when using tensor parallelism" assert ( - dist.get_world_size() % (tp_size * pp_size) == 0 - ), f"world size {dist.get_world_size()} is not divisible by tp_size {tp_size} * pp_size {pp_size}" + world_size % (tp_size * pp_size) == 0 + ), f"world size {world_size} is not divisible by tp_size {tp_size} * pp_size {pp_size}" assert ( - dist.get_world_size() % (tp_size * pp_size * ep_size) == 0 - ), f"world size {dist.get_world_size()} is not divisible by tp_size {tp_size} * pp_size {pp_size} * ep_size {ep_size}" - self.real_dp_size = dist.get_world_size() // (tp_size * pp_size * ep_size) - MOE_MANAGER.setup( - parallel="EP", - mode="fixed", - fixed_dp_size=self.real_dp_size, - fixed_ep_size=ep_size, - fixed_pp_size=pp_size, - use_ep_inside=use_ep_inside, - ) + world_size % (tp_size * pp_size * ep_size) == 0 + ), f"world size {world_size} is not divisible by tp_size {tp_size} * pp_size {pp_size} * ep_size {ep_size}" + self.dp_size = world_size // (tp_size * pp_size) self.tp_size = tp_size self.pp_size = pp_size - self.dp_size = dist.get_world_size() // (tp_size * pp_size) self.ep_size = ep_size - self.moe_info = MOE_MANAGER.get_info(0)[1] self.precision = precision self.zero_stage = zero_stage self.cpu_offload = cpu_offload @@ -219,28 +213,44 @@ def __init__( self.enable_jit_fused = enable_jit_fused self.enable_sequence_parallelism = enable_sequence_parallelism self.checkpoint_io = checkpoint_io + + # NOTE: Two process meshes: global dp for non-moe param; dp + ep for moe param + # See https://hpc-ai.com/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient # we change pg mesh to (pp, dp, tp) for better moe performance - self.pg_mesh = ProcessGroupMesh(self.pp_size, self.dp_size, self.tp_size) + assert ( + self.ep_size <= self.dp_size + ), f"Not enough devices({self.dp_size}) for expert parallelism size({self.ep_size})." 
- # sync moe in outer dp group, and sync other param in global dp group - if extra_dp_size > 1: - ep_size = self.dp_size // extra_dp_size - if use_ep_inside: - self.pg_mesh_moe = ProcessGroupMesh(self.pp_size, extra_dp_size, ep_size) - self.moe_extra_dp_group = self.pg_mesh_moe.get_group_along_axis(1) - if dist.get_rank() == 0: - print(f"Zero Parallel: pp {self.pp_size}, outer_dp {extra_dp_size}, inner_dp {ep_size}") - else: - self.pg_mesh_moe = ProcessGroupMesh(self.pp_size, ep_size, extra_dp_size) - self.moe_extra_dp_group = self.pg_mesh_moe.get_group_along_axis(2) - if dist.get_rank() == 0: - print(f"Zero Parallel: pp {self.pp_size}, outer_dp {ep_size}, inner_dp {extra_dp_size}") + self.moe_dp_size = self.dp_size // self.ep_size + self.use_ep_inside = use_ep_inside + if self.use_ep_inside: + self.pg_mesh = ProcessGroupMesh(self.pp_size, self.moe_dp_size, ep_size, tp_size) + self.moe_dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS) + self.ep_group = self.pg_mesh.get_group_along_axis(EP_AXIS) + if dist.get_rank() == 0: + print(f"MoE Parallel: pp {self.pp_size}, outer_dp {self.moe_dp_size}, inner_ep {ep_size}, tp {tp_size}") else: - self.moe_extra_dp_group = None + warnings.warn("Using ep outside dp (cross-node) is strongly discouraged due to communication costs.") + self.pg_mesh = ProcessGroupMesh(self.pp_size, ep_size, self.moe_dp_size, tp_size) + EP_AXIS = 1 + DP_AXIS = 2 + self.moe_dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS) + self.ep_group = self.pg_mesh.get_group_along_axis(EP_AXIS) + if dist.get_rank() == 0: + print(f"MoE Parallel: pp {self.pp_size}, outer_ep {ep_size}, inner_dp {self.moe_dp_size}, tp {tp_size}") + if dist.get_rank() == 0: + print(f"Non-MoE Parameter Parallel: pp {self.pp_size}, dp {self.dp_size}, tp {tp_size}") + + self.tp_group = self.pg_mesh.get_group_along_axis(TP_AXIS) # TODO: support custom tp size for mixtral lm head + self.global_dp_group = self.pg_mesh.get_group_along_axis((DP_AXIS, EP_AXIS)) + self.pp_group = self.pg_mesh.get_group_along_axis(PP_AXIS) + # TODO: Currently moe only support partially sequence parallel + self.sp_group = self.pg_mesh.get_group_along_axis(TP_AXIS) + self.custom_policy = custom_policy self.stage_manager = None self.schedule = None - self.custom_policy = custom_policy + assert zero_stage in (0, 1, 2) if self.pp_size > 1: assert ( @@ -251,9 +261,7 @@ def __init__( self.schedule = OneForwardOneBackwardSchedule( self.stage_manager, num_microbatches=num_microbatches, microbatch_size=microbatch_size ) - self.tp_group = self.pg_mesh.get_group_along_axis(TP_AXIS) - self.dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS) - self.pp_group = self.pg_mesh.get_group_along_axis(PP_AXIS) + self.shard_config = ShardConfig( tensor_parallel_process_group=self.tp_group, pipeline_stage_manager=self.stage_manager, @@ -264,6 +272,7 @@ def __init__( enable_jit_fused=self.enable_jit_fused, enable_sequence_parallelism=enable_sequence_parallelism, enable_sequence_overlap=enable_sequence_overlap, + ep_group=self.ep_group, ) self.amp_config = dict( initial_scale=initial_scale, @@ -343,9 +352,18 @@ def seed_worker(worker_id): def get_checkpoint_io(self) -> MoECheckpointIO: if self.checkpoint_io is None: - self.checkpoint_io = MoECheckpointIO(self.dp_group, self.pp_group, self.tp_group, self.zero_stage) + self.checkpoint_io = MoECheckpointIO(self.global_dp_group, self.pp_group, self.tp_group, self.zero_stage) else: - self.checkpoint_io = self.checkpoint_io(self.dp_group, self.pp_group, self.tp_group, self.zero_stage) + self.checkpoint_io = 
self.checkpoint_io( + self.global_dp_group, + self.pp_group, + self.tp_group, + ep_group=self.ep_group, + moe_dp_group=self.moe_dp_group, + zero_stage=self.zero_stage, + ) + if hasattr(self.checkpoint_io, "moe_info"): + self.checkpoint_io.moe_info = self.moe_info return self.checkpoint_io def configure( @@ -363,7 +381,7 @@ def configure( module=model, precision=self.precision, shard_config=self.shard_config, - dp_group=self.dp_group, + dp_group=self.global_dp_group, tp_group=self.tp_group, use_ddp=use_ddp, ddp_config=self.ddp_config, @@ -393,10 +411,10 @@ def configure( model, use_pipeline=self.enable_pipeline_parallelism, param_info=param_info, - dp_process_group=self.dp_group, + dp_process_group=self.global_dp_group, tp_process_group=self.tp_group, pp_process_group=self.pp_group, - moe_extra_dp_process_group=self.moe_extra_dp_group, + moe_extra_dp_process_group=self.moe_dp_group, verbose=True, clip_grad_norm=self.max_norm, **self.zero_config, diff --git a/colossalai/booster/plugin/torch_fsdp_plugin.py b/colossalai/booster/plugin/torch_fsdp_plugin.py index 5445b4a6349d..0aa0caa9aafe 100644 --- a/colossalai/booster/plugin/torch_fsdp_plugin.py +++ b/colossalai/booster/plugin/torch_fsdp_plugin.py @@ -27,7 +27,7 @@ from torch.optim.lr_scheduler import _LRScheduler as LRScheduler from torch.utils.data import DataLoader -from colossalai.checkpoint_io import CheckpointIO, GeneralCheckpointIO, utils, CheckpointIndexFile +from colossalai.checkpoint_io import CheckpointIndexFile, CheckpointIO, GeneralCheckpointIO, utils from colossalai.cluster import DistCoordinator from colossalai.interface import ModelWrapper, OptimizerWrapper @@ -93,9 +93,7 @@ def save_sharded_model( Path(checkpoint_path).mkdir(parents=True, exist_ok=True) with FSDP.state_dict_type( - model.unwrap(), - StateDictType.FULL_STATE_DICT, - FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + model.unwrap(), StateDictType.FULL_STATE_DICT, FullStateDictConfig(offload_to_cpu=True, rank0_only=True) ): state_dict = model.unwrap().state_dict() @@ -172,7 +170,7 @@ def save_sharded_optimizer( with FSDP.state_dict_type( optimizer.unwrap_model().unwrap(), StateDictType.FULL_STATE_DICT, - FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + FullStateDictConfig(offload_to_cpu=True, rank0_only=True), ): fsdp_optim_state = FSDP.full_optim_state_dict( optimizer.unwrap_model().unwrap(), optim=optimizer, rank0_only=True @@ -241,7 +239,6 @@ def load_sharded_optimizer(self, optimizer: Optimizer, index_file_path: str, siz ) optimizer.load_state_dict(fsdp_state) - def save_lr_scheduler(self, lr_scheduler: LRScheduler, checkpoint: str): """ Save model to checkpoint but only on master process. 
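Note on the MoE process-group layout introduced in moe_hybrid_parallel_plugin.py above: MoE parameters are now synchronized over the smaller moe_dp group, experts are sharded over the ep group, and non-MoE parameters keep using the global dp group obtained by flattening the (dp, ep) axes of the (pp, dp, ep, tp) mesh. The snippet below is a minimal illustrative sketch of which ranks land in each group; it uses plain Python only (no ColossalAI or torch.distributed calls), and the world size, axis sizes, and helper names are made up for the example rather than taken from the patch.

# Sketch only: rank membership of the groups built by the plugin when
# use_ep_inside=True, for a hypothetical mesh of shape (pp, moe_dp, ep, tp).
import itertools
from collections import defaultdict

pp, moe_dp, ep, tp = 1, 2, 2, 1  # world size 4; global dp size = moe_dp * ep = 4
shape = (pp, moe_dp, ep, tp)

def ravel(coord, shape):
    # Row-major rank of a mesh coordinate (C-ordered mesh).
    rank = 0
    for c, s in zip(coord, shape):
        rank = rank * s + c
    return rank

groups = defaultdict(list)
for coord in itertools.product(*(range(s) for s in shape)):
    rank = ravel(coord, shape)
    p, d, e, t = coord
    groups[("ep", p, d, t)].append(rank)      # differ only on the ep axis: expert-parallel group
    groups[("moe_dp", p, e, t)].append(rank)  # differ only on the moe_dp axis: grad sync for MoE params
    groups[("global_dp", p, t)].append(rank)  # differ on (moe_dp, ep): grad sync for non-MoE params

for key, ranks in sorted(groups.items()):
    print(key, ranks)

# Prints ep groups [0, 1] and [2, 3] (adjacent ranks, i.e. intra-node when
# use_ep_inside=True), moe_dp groups [0, 2] and [1, 3], and the global dp
# group [0, 1, 2, 3]; swapping the two middle axes gives the "ep outside"
# layout that the plugin warns against.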
diff --git a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py index 80822724982e..7156744c7ad4 100644 --- a/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py +++ b/colossalai/checkpoint_io/hybrid_parallel_checkpoint_io.py @@ -63,13 +63,13 @@ def __init__( verbose: bool = True, ) -> None: super().__init__() - self.dp_group = dp_group + self.global_dp_group = dp_group self.pp_group = pp_group self.tp_group = tp_group - self.dp_rank = dist.get_rank(self.dp_group) + self.dp_rank = dist.get_rank(self.global_dp_group) self.tp_rank = dist.get_rank(self.tp_group) self.pp_rank = dist.get_rank(self.pp_group) - self.dp_size = dist.get_world_size(dp_group) + self.global_dp_size = dist.get_world_size(dp_group) self.pp_size = dist.get_world_size(pp_group) self.tp_size = dist.get_world_size(tp_group) self.use_zero = zero_stage > 0 @@ -424,7 +424,7 @@ def save_sharded_optimizer( state_dict_shard = HybridParallelCheckpointIO._optimizer_sharder( optimizer, use_zero=self.use_zero, - dp_group=self.dp_group, + dp_group=self.global_dp_group, tp_group=self.tp_group, size_per_shard=size_per_shard, ) @@ -525,96 +525,96 @@ def save_sharded_optimizer( f"index located at {final_index_file_path}." ) - def load_sharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint_index_file: str, prefix: str = ""): - """ - Load sharded optimizer with the given path to index file of checkpoint folder. - - Args: - optimizer (OptimizerWrapper): The optimizer to be loaded. - checkpoint_index_file (str): Path to the index file of checkpointing folder. - prefix (str): Not used. - """ - assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before loading!" - - def _get_param_id_from_optimizer_param( - param: torch.Tensor, master_to_working_map: Optional[Dict[int, torch.Tensor]] = None - ): - if master_to_working_map is not None: - working_param = master_to_working_map[id(param)] - else: - working_param = param - return optimizer.param_info["param2id"][id(working_param)] - - # id_map is a mapping from param ids kept by current pipeline, to their corresponding parameter objects. - # When Zero is used, the mapped parameter objects should be fp32 master parameters. - # IDs should be obtained through saved param2id mapping earlier saved in optimizer.param_info. - id_map = {} - master_to_working_map = optimizer.get_master_to_working_map() - for pg in optimizer.optim.param_groups: - for param in pg["params"]: - param_id = _get_param_id_from_optimizer_param(param, master_to_working_map) - id_map[param_id] = param - - # Read checkpoint index file. - ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file) - ckpt_root_path = ckpt_index_file.root_path - weight_map = ckpt_index_file.weight_map - weight_map = {int(k): v for k, v in weight_map.items()} # convert saved id from str to int - - # Load param_groups - param_group_path = ckpt_index_file.get_param_group_filename() - if param_group_path is None: - raise RuntimeError( - f"Invalid index file path {checkpoint_index_file} for an optimizer. \ - Lacking param group file under current directory." - ) - saved_groups = torch.load(param_group_path) - - updated_groups = [] - for old_pg, saved_pg in zip(optimizer.optim.param_groups, saved_groups): - # obtain updated param group - new_pg = copy.deepcopy(saved_pg) - new_pg["params"] = old_pg["params"] # The parameters in the same group shouldn't change. 
- updated_groups.append(new_pg) - optimizer.optim.__dict__.update({"param_groups": updated_groups}) - - # Load saved states to optimizer. - # Keep a record of loaded files so that file will not be repeatedly loaded. - loaded_file = set() - for pg in optimizer.optim.param_groups: - for param in pg["params"]: - if param is None: - continue - param_id = _get_param_id_from_optimizer_param(param, master_to_working_map) - if param_id not in weight_map: - continue - filename = weight_map[param_id] - - # If this param's states has been loaded before, directly return. - if filename in loaded_file: - continue - - file_path = os.path.join(ckpt_root_path, filename) - state_dict = load_shard_state_dict(Path(file_path), use_safetensors=False) - load_states_into_optimizer(optimizer.optim, state_dict, id_map, strict=True) - loaded_file.add(filename) - - # Then shard the loaded optimizer states if using tp/zero. - for param, state in optimizer.optim.state.items(): - device = param.device - if master_to_working_map is not None: - working_param = master_to_working_map[id(param)] - else: - working_param = param - original_shape = optimizer.param_info["param2shape"][id(working_param)] - sharded_state = self.shard_from_complete_optimizer_state( - state, current_shape=working_param.shape, original_shape=original_shape, device=device, inplace=True - ) - optimizer.optim.state[param] = sharded_state - - sharded_optimizer_loading_epilogue(optimizer.optim) - if self.verbose and self.coordinator.is_master(): - logging.info(f"The optimizer has been successfully loaded from sharded checkpoint: {ckpt_root_path}.") + # def load_sharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint_index_file: str, prefix: str = ""): + # """ + # Load sharded optimizer with the given path to index file of checkpoint folder. + + # Args: + # optimizer (OptimizerWrapper): The optimizer to be loaded. + # checkpoint_index_file (str): Path to the index file of checkpointing folder. + # prefix (str): Not used. + # """ + # assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before loading!" + + # def _get_param_id_from_optimizer_param( + # param: torch.Tensor, master_to_working_map: Optional[Dict[int, torch.Tensor]] = None + # ): + # if master_to_working_map is not None: + # working_param = master_to_working_map[id(param)] + # else: + # working_param = param + # return optimizer.param_info["param2id"][id(working_param)] + + # # id_map is a mapping from param ids kept by current pipeline, to their corresponding parameter objects. + # # When Zero is used, the mapped parameter objects should be fp32 master parameters. + # # IDs should be obtained through saved param2id mapping earlier saved in optimizer.param_info. + # id_map = {} + # master_to_working_map = optimizer.get_master_to_working_map() + # for pg in optimizer.optim.param_groups: + # for param in pg["params"]: + # param_id = _get_param_id_from_optimizer_param(param, master_to_working_map) + # id_map[param_id] = param + + # # Read checkpoint index file. + # ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file) + # ckpt_root_path = ckpt_index_file.root_path + # weight_map = ckpt_index_file.weight_map + # weight_map = {int(k): v for k, v in weight_map.items()} # convert saved id from str to int + + # # Load param_groups + # param_group_path = ckpt_index_file.get_param_group_filename() + # if param_group_path is None: + # raise RuntimeError( + # f"Invalid index file path {checkpoint_index_file} for an optimizer. 
\ + # Lacking param group file under current directory." + # ) + # saved_groups = torch.load(param_group_path) + + # updated_groups = [] + # for old_pg, saved_pg in zip(optimizer.optim.param_groups, saved_groups): + # # obtain updated param group + # new_pg = copy.deepcopy(saved_pg) + # new_pg["params"] = old_pg["params"] # The parameters in the same group shouldn't change. + # updated_groups.append(new_pg) + # optimizer.optim.__dict__.update({"param_groups": updated_groups}) + + # # Load saved states to optimizer. + # # Keep a record of loaded files so that file will not be repeatedly loaded. + # loaded_file = set() + # for pg in optimizer.optim.param_groups: + # for param in pg["params"]: + # if param is None: + # continue + # param_id = _get_param_id_from_optimizer_param(param, master_to_working_map) + # if param_id not in weight_map: + # continue + # filename = weight_map[param_id] + + # # If this param's states has been loaded before, directly return. + # if filename in loaded_file: + # continue + + # file_path = os.path.join(ckpt_root_path, filename) + # state_dict = load_shard_state_dict(Path(file_path), use_safetensors=False) + # load_states_into_optimizer(optimizer.optim, state_dict, id_map, strict=True) + # loaded_file.add(filename) + + # # Then shard the loaded optimizer states if using tp/zero. + # for param, state in optimizer.optim.state.items(): + # device = param.device + # if master_to_working_map is not None: + # working_param = master_to_working_map[id(param)] + # else: + # working_param = param + # original_shape = optimizer.param_info["param2shape"][id(working_param)] + # sharded_state = self.shard_from_complete_optimizer_state( + # state, current_shape=working_param.shape, original_shape=original_shape, device=device, inplace=True + # ) + # optimizer.optim.state[param] = sharded_state + + # sharded_optimizer_loading_epilogue(optimizer.optim) + # if self.verbose and self.coordinator.is_master(): + # logging.info(f"The optimizer has been successfully loaded from sharded checkpoint: {ckpt_root_path}.") def save_unsharded_model(self, model: ModelWrapper, checkpoint: str, gather_dtensor: bool, use_safetensors: bool): """ @@ -718,7 +718,7 @@ def save_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, state, working_param, original_shape=original_shape, - dp_group=self.dp_group, + dp_group=self.global_dp_group, tp_group=self.tp_group, use_zero=self.use_zero, inplace=False, @@ -905,12 +905,12 @@ def shard_from_complete_optimizer_state( # Shard state along data parallel group when using Zero. 
if self.use_zero: - padding_size = (self.dp_size - v.numel() % self.dp_size) % self.dp_size + padding_size = (self.global_dp_size - v.numel() % self.global_dp_size) % self.global_dp_size with torch.no_grad(): v = v.flatten() if padding_size > 0: v = torch.nn.functional.pad(v, [0, padding_size]) - slice_size = v.numel() // self.dp_size + slice_size = v.numel() // self.global_dp_size v = v.split(slice_size, dim=0)[self.dp_rank] state_[k] = v.detach().clone().to(device) diff --git a/colossalai/checkpoint_io/utils.py b/colossalai/checkpoint_io/utils.py index e1800f29b0af..61351cab40b3 100644 --- a/colossalai/checkpoint_io/utils.py +++ b/colossalai/checkpoint_io/utils.py @@ -233,6 +233,7 @@ def save_state_dict_shards( shard_filenames = [] for idx, shard_pair in enumerate(sharded_state_dict): shard, current_size = shard_pair + # Just loop over the sharder and gather to other ranks if not master if not is_master: del shard continue @@ -294,6 +295,7 @@ def shard_optimizer_checkpoint(state_dict: dict, max_shard_size: int = 1024) -> # Helper functions for saving state dict # ====================================== + def save_state_dict(state_dict: dict, checkpoint_file_path: str, use_safetensors: bool) -> None: """ Save state dict to checkpoint. @@ -305,7 +307,7 @@ def save_state_dict(state_dict: dict, checkpoint_file_path: str, use_safetensors """ # Move all tensors in the state_dict to CPU before saving to avoid serialization issues state_dict_cpu = tree_map(lambda x: x.cpu() if torch.is_tensor(x) else x, state_dict) - + if use_safetensors: assert is_safetensors_available(), "safetensors is not available." assert checkpoint_file_path.endswith( diff --git a/colossalai/cluster/process_group_mesh.py b/colossalai/cluster/process_group_mesh.py index ae3956c693ab..e327e0108f39 100644 --- a/colossalai/cluster/process_group_mesh.py +++ b/colossalai/cluster/process_group_mesh.py @@ -161,7 +161,7 @@ def get_ranks_in_group(self, group: ProcessGroup) -> List[int]: @staticmethod def get_coords_along_axis( - base_coord: Tuple[int, ...], axis: int, indices_at_axis: List[int] + base_coord: Tuple[int, ...], axis: Union[int, List[int]], indices_at_axis: Union[List[int], List[List[int]]] ) -> List[Tuple[int, ...]]: """Get coordinates along the given axis. @@ -173,13 +173,38 @@ def get_coords_along_axis( Returns: List[Tuple[int, ...]]: Coordinates along the axis. 
""" - coords_in_group = [] - for idx in indices_at_axis: - coords_in_group.append(base_coord[:axis] + (idx,) + base_coord[axis + 1 :]) + if isinstance(axis, int): + axis = [ + axis, + ] + assert isinstance(indices_at_axis[0], int) + indices_at_axis = [ + indices_at_axis, + ] + + def add_index(base_coord, axis, indices_at_axis): + coords_in_group = [] + for idx in indices_at_axis: + coord = base_coord[:axis] + (idx,) + if axis + 1 < len(base_coord) and axis != -1: + coord += base_coord[axis + 1 :] + coords_in_group.append(coord) + return coords_in_group + + coords_in_group = [base_coord] + for ax, indices_at_ax in zip(axis, indices_at_axis): + new_coords_in_group = [] + for coords in coords_in_group: + new_coords_in_group += add_index(coords, ax, indices_at_ax) + coords_in_group = new_coords_in_group + return coords_in_group def create_group_along_axis( - self, axis: int, indices_at_axis: Optional[List[int]] = None, backend: Optional[str] = None + self, + axis: Union[int, List[int]], + indices_at_axis: Optional[Union[List[int], List[List[int]]]] = None, + backend: Optional[str] = None, ) -> ProcessGroup: """Create all process groups along the given axis, and return the one which the current process belongs to. @@ -191,10 +216,21 @@ def create_group_along_axis( Returns: ProcessGroup: The process group along the given axis which the current process belongs to. """ - indices_at_axis = indices_at_axis or list(range(self._shape[axis])) + if isinstance(axis, int): + axis = [ + axis, + ] + if indices_at_axis is not None: + assert isinstance(indices_at_axis[0], int) + indices_at_axis = [ + indices_at_axis, + ] + + indices_at_axis = indices_at_axis or [list(range(self._shape[ax])) for ax in axis] reduced_shape = list(self._shape) # the choices on the axis are reduced to 1, since it's determined by `indices_at_axis` - reduced_shape[axis] = 1 + for ax in axis: + reduced_shape[ax] = 1 target_group = None # use Cartesian product to generate all combinations of coordinates for base_coord in itertools.product(*[range(s) for s in reduced_shape]): @@ -206,23 +242,31 @@ def create_group_along_axis( return target_group def get_group_along_axis( - self, axis: int, indices_at_axis: Optional[List[int]] = None, backend: Optional[str] = None + self, axis: Union[int, List[int]], indices_at_axis: Optional[List[int]] = None, backend: Optional[str] = None ) -> ProcessGroup: """Get the process group along the given axis which the current process belongs to. If the process group doesn't exist, it will be created. Args: - axis (int): Axis along which the process groups are created. + axis (int or list of int): Axes along which the process groups are created. indices_at_axis (Optional[List[int]], optional): Indices at the axis. Defaults to None. backend (Optional[str], optional): Backend of the process group. Defaults to None. Returns: ProcessGroup: The process group along the given axis which the current process belongs to. 
""" - indices_at_axis = indices_at_axis or list(range(self._shape[axis])) + indices_at_axis = indices_at_axis + if indices_at_axis is None: + if isinstance(axis, (list, tuple)): + indices_at_axis = list(list(range(self._shape[ax])) for ax in axis) + else: + indices_at_axis = list(range(self._shape[axis])) + coords_in_group = ProcessGroupMesh.get_coords_along_axis(self._coord, axis, indices_at_axis) - ranks_in_group = tuple([ProcessGroupMesh.ravel(coord, self._shape) for coord in coords_in_group]) + try: + ranks_in_group = tuple([ProcessGroupMesh.ravel(coord, self._shape) for coord in coords_in_group]) + except: + pass if ranks_in_group not in self._ranks_to_group: # no need to cache it explicitly, since it will be cached in `create_group_along_axis` return self.create_group_along_axis(axis, indices_at_axis, backend=backend) return self._ranks_to_group[ranks_in_group] - \ No newline at end of file diff --git a/colossalai/inference/engine/modeling/llama.py b/colossalai/inference/engine/modeling/llama.py index b7bc94d0eae0..a7efb4026be0 100644 --- a/colossalai/inference/engine/modeling/llama.py +++ b/colossalai/inference/engine/modeling/llama.py @@ -29,13 +29,17 @@ try: from colossalai.kernel.triton.flash_decoding import token_flash_decoding + HAS_TRITON_FLASH_DECODING_KERNEL = True except: - print("no triton flash decoding support, please install lightllm from https://github.com/ModelTC/lightllm/blob/ece7b43f8a6dfa74027adc77c2c176cff28c76c8") + print( + "no triton flash decoding support, please install lightllm from https://github.com/ModelTC/lightllm/blob/ece7b43f8a6dfa74027adc77c2c176cff28c76c8" + ) HAS_TRITON_FLASH_DECODING_KERNEL = False - + try: from flash_attn import flash_attn_with_kvcache + HAS_FLASH_KERNEL = True except: HAS_FLASH_KERNEL = False @@ -48,6 +52,7 @@ def rotate_half(x): x2 = x[..., x.shape[-1] // 2 :] return torch.cat((-x2, x1), dim=-1) + def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] @@ -96,17 +101,22 @@ def llama_triton_context_attention( infer_state.max_len_in_batch, ) -def llama_triton_token_attention(query_states, attn_output, infer_state, num_key_value_groups=1, q_head_num = -1, head_dim = -1): + +def llama_triton_token_attention( + query_states, attn_output, infer_state, num_key_value_groups=1, q_head_num=-1, head_dim=-1 +): if HAS_TRITON_FLASH_DECODING_KERNEL and q_head_num != -1 and head_dim != -1: - token_flash_decoding(q = query_states, - o_tensor = attn_output, - infer_state = infer_state, - q_head_num = q_head_num, - head_dim = head_dim, - cache_k = infer_state.cache_manager.key_buffer[infer_state.decode_layer_id], - cache_v = infer_state.cache_manager.value_buffer[infer_state.decode_layer_id]) - return - + token_flash_decoding( + q=query_states, + o_tensor=attn_output, + infer_state=infer_state, + q_head_num=q_head_num, + head_dim=head_dim, + cache_k=infer_state.cache_manager.key_buffer[infer_state.decode_layer_id], + cache_v=infer_state.cache_manager.value_buffer[infer_state.decode_layer_id], + ) + return + if num_key_value_groups == 1: token_attention_fwd( query_states, @@ -459,14 +469,15 @@ def llama_flash_attn_kvcache_forward( ) if HAS_LIGHTLLM_KERNEL: - attn_output = torch.empty_like(query_states) - llama_triton_token_attention(query_states = query_states, - attn_output = attn_output, - infer_state = infer_state, - num_key_value_groups = self.num_key_value_groups, - q_head_num = q_len * self.num_heads, - head_dim = self.head_dim) + llama_triton_token_attention( + query_states=query_states, + attn_output=attn_output, + infer_state=infer_state, + num_key_value_groups=self.num_key_value_groups, + q_head_num=q_len * self.num_heads, + head_dim=self.head_dim, + ) else: self.num_heads // self.num_key_value_heads cache_k = infer_state.cache_manager.key_buffer[infer_state.decode_layer_id] diff --git a/colossalai/inference/quant/gptq/cai_gptq/cai_quant_linear.py b/colossalai/inference/quant/gptq/cai_gptq/cai_quant_linear.py index ca12c34ed958..36339ac88486 100644 --- a/colossalai/inference/quant/gptq/cai_gptq/cai_quant_linear.py +++ b/colossalai/inference/quant/gptq/cai_gptq/cai_quant_linear.py @@ -18,15 +18,15 @@ HAS_GPTQ_CUDA = False try: from colossalai.kernel.op_builder.gptq import GPTQBuilder + gptq_cuda = GPTQBuilder().load() HAS_GPTQ_CUDA = True except ImportError: - warnings.warn('CUDA gptq is not installed') + warnings.warn("CUDA gptq is not installed") HAS_GPTQ_CUDA = False class CaiQuantLinear(nn.Module): - def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp_rank=0, row_split=False): super().__init__() if bits not in [2, 4, 8]: @@ -37,23 +37,28 @@ def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp self.maxq = 2**self.bits - 1 self.groupsize = groupsize if groupsize != -1 else infeatures - self.register_buffer('qweight', torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32)) + self.register_buffer("qweight", torch.zeros((infeatures // 32 * self.bits, outfeatures), dtype=torch.int32)) + self.register_buffer( + "qzeros", + torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures // 32 * self.bits), dtype=torch.int32), + ) self.register_buffer( - 'qzeros', - torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures // 32 * self.bits), dtype=torch.int32)) - self.register_buffer('scales', - torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures), dtype=torch.float16)) + "scales", 
torch.zeros((math.ceil(infeatures / self.groupsize), outfeatures), dtype=torch.float16) + ) if row_split: self.register_buffer( - 'g_idx', - torch.tensor([(i + (tp_rank * self.infeatures)) // self.groupsize for i in range(infeatures)], - dtype=torch.int32)) + "g_idx", + torch.tensor( + [(i + (tp_rank * self.infeatures)) // self.groupsize for i in range(infeatures)], dtype=torch.int32 + ), + ) else: - self.register_buffer('g_idx', - torch.tensor([i // self.groupsize for i in range(infeatures)], dtype=torch.int32)) + self.register_buffer( + "g_idx", torch.tensor([i // self.groupsize for i in range(infeatures)], dtype=torch.int32) + ) if bias: - self.register_buffer('bias', torch.zeros((outfeatures), dtype=torch.float16)) + self.register_buffer("bias", torch.zeros((outfeatures), dtype=torch.float16)) else: self.bias = None @@ -66,9 +71,11 @@ def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp self.row_split = row_split def pack(self, linear, scales, zeros, g_idx=None): - - g_idx = g_idx.clone() if g_idx is not None else torch.tensor( - [i // self.groupsize for i in range(self.infeatures)], dtype=torch.int32) + g_idx = ( + g_idx.clone() + if g_idx is not None + else torch.tensor([i // self.groupsize for i in range(self.infeatures)], dtype=torch.int32) + ) scales = scales.t().contiguous() zeros = zeros.t().contiguous() @@ -79,7 +86,6 @@ def pack(self, linear, scales, zeros, g_idx=None): if linear.bias is not None: self.bias = linear.bias.clone().half() - wn = 8 pbits = 32 ptype = torch.int32 unsign_type = np.uint32 @@ -88,9 +94,10 @@ def pack(self, linear, scales, zeros, g_idx=None): intweight = [] for idx in range(self.infeatures): intweight.append( - torch.round( - (linear.weight.data[:, idx] + scale_zeros[g_idx[idx]]) / half_scales[g_idx[idx]]).to(ptype)[:, - None]) + torch.round((linear.weight.data[:, idx] + scale_zeros[g_idx[idx]]) / half_scales[g_idx[idx]]).to(ptype)[ + :, None + ] + ) intweight = torch.cat(intweight, dim=1) intweight = intweight.t().contiguous() intweight = intweight.numpy().astype(unsign_type) @@ -109,7 +116,7 @@ def pack(self, linear, scales, zeros, g_idx=None): raise NotImplementedError("Only 2,4,8 bits are supported.") qweight = qweight.astype(sign_type) qweight1 = torch.from_numpy(qweight) - qweight1 = qweight1.contiguous() #.to("cuda") + qweight1 = qweight1.contiguous() # .to("cuda") self.qweight.data.copy_(qweight1) qzeros = np.zeros((zeros.shape[0], zeros.shape[1] // pbits * self.bits), dtype=unsign_type) @@ -140,17 +147,20 @@ def init_q4(self): self.q4_width = self.qweight.shape[1] if self.g_idx is not None: if self.row_split and torch.equal( - self.g_idx, - torch.tensor( - [(i + (self.tp_rank * self.infeatures)) // self.groupsize for i in range(self.infeatures)], - dtype=torch.int32, - device=self.g_idx.device)): + self.g_idx, + torch.tensor( + [(i + (self.tp_rank * self.infeatures)) // self.groupsize for i in range(self.infeatures)], + dtype=torch.int32, + device=self.g_idx.device, + ), + ): self.g_idx = None elif torch.equal( - self.g_idx, - torch.tensor([i // self.groupsize for i in range(self.infeatures)], - dtype=torch.int32, - device=self.g_idx.device)): + self.g_idx, + torch.tensor( + [i // self.groupsize for i in range(self.infeatures)], dtype=torch.int32, device=self.g_idx.device + ), + ): self.g_idx = None if self.g_idx is not None: @@ -165,7 +175,6 @@ def forward(self, x): outshape = x.shape[:-1] + (self.outfeatures,) if HAS_GPTQ_CUDA and self.bits == 4: - if self.q4 is None: self.init_q4() @@ -191,7 +200,6 @@ def 
forward(self, x): def split_column_copy(gptq_linear, cai_linear, tp_size=1, tp_rank=0, split_num=1): - qweights = gptq_linear.qweight.split(gptq_linear.out_features // split_num, dim=-1) qzeros = gptq_linear.qzeros.split(gptq_linear.out_features // (32 // cai_linear.bits) // split_num, dim=-1) scales = gptq_linear.scales.split(gptq_linear.out_features // split_num, dim=-1) @@ -203,24 +211,24 @@ def split_column_copy(gptq_linear, cai_linear, tp_size=1, tp_rank=0, split_num=1 zero_split_block = cai_linear.outfeatures // (32 // cai_linear.bits) // split_num for i in range(split_num): - cai_linear.qweight[:, i * cai_split_out_features:(i + 1) * - cai_split_out_features] = qweights[i][:, tp_rank * cai_split_out_features:(tp_rank + 1) * - cai_split_out_features] - cai_linear.qzeros[:, i * zero_split_block:(i + 1) * - zero_split_block] = qzeros[i][:, tp_rank * zero_split_block:(tp_rank + 1) * zero_split_block] - cai_linear.scales[:, i * cai_split_out_features:(i + 1) * - cai_split_out_features] = scales[i][:, tp_rank * cai_split_out_features:(tp_rank + 1) * - cai_split_out_features] + cai_linear.qweight[:, i * cai_split_out_features : (i + 1) * cai_split_out_features] = qweights[i][ + :, tp_rank * cai_split_out_features : (tp_rank + 1) * cai_split_out_features + ] + cai_linear.qzeros[:, i * zero_split_block : (i + 1) * zero_split_block] = qzeros[i][ + :, tp_rank * zero_split_block : (tp_rank + 1) * zero_split_block + ] + cai_linear.scales[:, i * cai_split_out_features : (i + 1) * cai_split_out_features] = scales[i][ + :, tp_rank * cai_split_out_features : (tp_rank + 1) * cai_split_out_features + ] if cai_linear.bias is not None: - cai_linear.bias[i * cai_split_out_features:(i + 1) * - cai_split_out_features] = bias[i][tp_rank * cai_split_out_features:(tp_rank + 1) * - cai_split_out_features] + cai_linear.bias[i * cai_split_out_features : (i + 1) * cai_split_out_features] = bias[i][ + tp_rank * cai_split_out_features : (tp_rank + 1) * cai_split_out_features + ] cai_linear.g_idx.copy_(g_idx) def split_row_copy(gptq_linear, cai_linear, tp_rank=0, split_num=1): - qweights = gptq_linear.qweight.split(gptq_linear.in_features // split_num, dim=0) qzeros = gptq_linear.qzeros.split(gptq_linear.in_features // split_num, dim=0) scales = gptq_linear.scales.split(gptq_linear.in_features // split_num, dim=0) @@ -231,47 +239,40 @@ def split_row_copy(gptq_linear, cai_linear, tp_rank=0, split_num=1): idx_split_features = cai_linear.infeatures // split_num for i in range(split_num): - cai_linear.qweight[i * cai_split_in_features:(i + 1) * - cai_split_in_features, :] = qweights[i][tp_rank * cai_split_in_features:(tp_rank + 1) * - cai_split_in_features, :] - cai_linear.qzeros[i * zero_split_block:(i + 1) * - zero_split_block, :] = qzeros[i][tp_rank * zero_split_block:(tp_rank + 1) * - zero_split_block, :] - cai_linear.scales[i * zero_split_block:(i + 1) * - zero_split_block, :] = scales[i][tp_rank * zero_split_block:(tp_rank + 1) * - zero_split_block, :] - cai_linear.g_idx[i * idx_split_features:(i + 1) * - idx_split_features] = g_idxs[i][tp_rank * idx_split_features:(tp_rank + 1) * - idx_split_features] + cai_linear.qweight[i * cai_split_in_features : (i + 1) * cai_split_in_features, :] = qweights[i][ + tp_rank * cai_split_in_features : (tp_rank + 1) * cai_split_in_features, : + ] + cai_linear.qzeros[i * zero_split_block : (i + 1) * zero_split_block, :] = qzeros[i][ + tp_rank * zero_split_block : (tp_rank + 1) * zero_split_block, : + ] + cai_linear.scales[i * zero_split_block : (i + 1) * zero_split_block, :] = 
scales[i][ + tp_rank * zero_split_block : (tp_rank + 1) * zero_split_block, : + ] + cai_linear.g_idx[i * idx_split_features : (i + 1) * idx_split_features] = g_idxs[i][ + tp_rank * idx_split_features : (tp_rank + 1) * idx_split_features + ] if cai_linear.bias is not None: cai_linear.bias.copy_(gptq_linear.bias) class RowCaiQuantLinear(CaiQuantLinear, ParallelModule): - def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp_rank=0, row_split=False): - - super().__init__(bits, - groupsize, - infeatures, - outfeatures, - bias, - tp_size=tp_size, - tp_rank=tp_rank, - row_split=row_split) + super().__init__( + bits, groupsize, infeatures, outfeatures, bias, tp_size=tp_size, tp_rank=tp_rank, row_split=row_split + ) self.process_group = None @staticmethod - def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, - **kwargs) -> ParallelModule: + def from_native_module( + module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs + ) -> ParallelModule: LazyInitContext.materialize(module) # get the attributes in_features = module.in_features # ensure only one process group is passed if isinstance(process_group, (list, tuple)): - assert len(process_group) == 1, \ - f'Expected only one process group, got {len(process_group)}.' + assert len(process_group) == 1, f"Expected only one process group, got {len(process_group)}." process_group = process_group[0] tp_size = dist.get_world_size(process_group) @@ -282,15 +283,18 @@ def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, Lis if in_features % tp_size != 0: raise ValueError( - f"The size of in_features:{in_features} is not integer multiples of tensor parallel size: {tp_size}!") - linear_1d = RowCaiQuantLinear(module.bits, - module.group_size, - module.in_features // tp_size, - module.out_features, - module.bias is not None, - tp_size=tp_size, - tp_rank=tp_rank, - row_split=True) + f"The size of in_features:{in_features} is not integer multiples of tensor parallel size: {tp_size}!" + ) + linear_1d = RowCaiQuantLinear( + module.bits, + module.group_size, + module.in_features // tp_size, + module.out_features, + module.bias is not None, + tp_size=tp_size, + tp_rank=tp_rank, + row_split=True, + ) linear_1d.process_group = process_group split_row_copy(module, linear_1d, tp_rank=tp_rank, **kwargs) @@ -306,30 +310,23 @@ def forward(self, x): class ColCaiQuantLinear(CaiQuantLinear, ParallelModule): - def __init__(self, bits, groupsize, infeatures, outfeatures, bias, tp_size=1, tp_rank=0, row_split=False): - - super().__init__(bits, - groupsize, - infeatures, - outfeatures, - bias, - tp_size=tp_size, - tp_rank=tp_rank, - row_split=row_split) + super().__init__( + bits, groupsize, infeatures, outfeatures, bias, tp_size=tp_size, tp_rank=tp_rank, row_split=row_split + ) self.process_group = None @staticmethod - def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, - **kwargs) -> ParallelModule: + def from_native_module( + module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs + ) -> ParallelModule: LazyInitContext.materialize(module) # get the attributes in_features = module.in_features # ensure only one process group is passed if isinstance(process_group, (list, tuple)): - assert len(process_group) == 1, \ - f'Expected only one process group, got {len(process_group)}.' 
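The reformatted split_column_copy and split_row_copy helpers above boil down to contiguous slicing: each tensor-parallel rank keeps its own block of output (or input) features, buffer by buffer. A small self-contained sketch of the column case, with made-up sizes:

import torch

# Illustrative column split in the spirit of split_column_copy: rank tp_rank
# keeps columns [tp_rank * shard, (tp_rank + 1) * shard) of the full weight.
out_features, tp_size = 8, 2
full_weight = torch.arange(2 * out_features).reshape(2, out_features)
shard = out_features // tp_size
for tp_rank in range(tp_size):
    local = full_weight[:, tp_rank * shard : (tp_rank + 1) * shard]
    print(tp_rank, local.tolist())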
+ assert len(process_group) == 1, f"Expected only one process group, got {len(process_group)}." process_group = process_group[0] tp_size = dist.get_world_size(process_group) @@ -340,14 +337,17 @@ def from_native_module(module: nn.Module, process_group: Union[ProcessGroup, Lis if in_features % tp_size != 0: raise ValueError( - f"The size of in_features:{in_features} is not integer multiples of tensor parallel size: {tp_size}!") - linear_1d = ColCaiQuantLinear(module.bits, - module.group_size, - module.in_features, - module.out_features // tp_size, - module.bias is not None, - tp_size=tp_size, - tp_rank=tp_rank) + f"The size of in_features:{in_features} is not integer multiples of tensor parallel size: {tp_size}!" + ) + linear_1d = ColCaiQuantLinear( + module.bits, + module.group_size, + module.in_features, + module.out_features // tp_size, + module.bias is not None, + tp_size=tp_size, + tp_rank=tp_rank, + ) linear_1d.process_group = process_group split_column_copy(module, linear_1d, tp_rank=tp_rank, **kwargs) diff --git a/colossalai/kernel/triton/context_attention.py b/colossalai/kernel/triton/context_attention.py index 3d9a23d2f5d2..1725581d637c 100644 --- a/colossalai/kernel/triton/context_attention.py +++ b/colossalai/kernel/triton/context_attention.py @@ -5,6 +5,7 @@ try: import triton import triton.language as tl + HAS_TRITON = True except ImportError: HAS_TRITON = False @@ -16,6 +17,7 @@ https://github.com/ModelTC/lightllm/blob/f093edc20683ac3ea1bca3fb5d8320a0dd36cf7b/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L10 """ if triton.__version__ < "2.1.0": + @triton.jit def _context_flash_attention_kernel( Q, @@ -131,29 +133,47 @@ def _context_flash_attention_kernel( m_i = m_i_new off_o = ( - (cur_batch_start_index + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_d[None, :] * stride_od + (cur_batch_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + + offs_d[None, :] * stride_od ) out_ptrs = Out + off_o tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) return + else: # this function is modified from https://github.com/ModelTC/lightllm/blob/main/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L11 @triton.jit def _context_flash_attention_kernel_2( - Q, K, V, sm_scale, Alibi, B_Start_Loc, B_Seqlen, - Out, - kv_group_num, - stride_qbs, stride_qh, stride_qd, - stride_kbs, stride_kh, stride_kd, - stride_vbs, stride_vh, stride_vd, - stride_obs, stride_oh, stride_od, - BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, + Q, + K, + V, + sm_scale, + Alibi, + B_Start_Loc, + B_Seqlen, + Out, + kv_group_num, + stride_qbs, + stride_qh, + stride_qd, + stride_kbs, + stride_kh, + stride_kd, + stride_vbs, + stride_vh, + stride_vd, + stride_obs, + stride_oh, + stride_od, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, ): cur_batch = tl.program_id(0) cur_head = tl.program_id(1) start_m = tl.program_id(2) - + if kv_group_num is not None: cur_kv_head = cur_head // kv_group_num @@ -166,7 +186,11 @@ def _context_flash_attention_kernel_2( offs_n = tl.arange(0, BLOCK_N) offs_d = tl.arange(0, BLOCK_DMODEL) offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - off_q = (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + cur_head * stride_qh + offs_d[None, :] * stride_qd + off_q = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + + offs_d[None, :] * stride_qd + ) if kv_group_num is None or kv_group_num == 1: off_k = offs_n[None, :] * stride_kbs + 
cur_head * stride_kh + offs_d[:, None] * stride_kd off_v = offs_n[:, None] * stride_vbs + cur_head * stride_vh + offs_d[None, :] * stride_vd @@ -191,8 +215,11 @@ def _context_flash_attention_kernel_2( for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N): start_n = tl.multiple_of(start_n, BLOCK_N) # -- compute qk ---- - k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) < cur_batch_seq_len, other=0.0) + k = tl.load( + k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, + mask=(start_n + offs_n[None, :]) < cur_batch_seq_len, + other=0.0, + ) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) qk += tl.dot(q, k) @@ -220,8 +247,11 @@ def _context_flash_attention_kernel_2( acc_scale = l_i / l_i_new * alpha acc = acc * acc_scale[:, None] # update acc - v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < cur_batch_seq_len, other=0.0) + v = tl.load( + v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, + mask=(start_n + offs_n[:, None]) < cur_batch_seq_len, + other=0.0, + ) p = p.to(v.dtype) acc += tl.dot(p, v) @@ -229,7 +259,11 @@ def _context_flash_attention_kernel_2( l_i = l_i_new m_i = m_i_new # initialize pointers to output - off_o = (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_d[None, :] * stride_od + off_o = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + + offs_d[None, :] * stride_od + ) out_ptrs = Out + off_o tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len) return @@ -249,7 +283,7 @@ def bloom_context_attn_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len, al grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) num_warps = 4 if Lk <= 64 else 8 - + if triton.__version__ < "2.1.0": tmp = torch.empty((batch, head, max_input_len + 256), device=q.device, dtype=torch.float32) _context_flash_attention_kernel[grid]( @@ -286,20 +320,26 @@ def bloom_context_attn_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len, al ) else: _context_flash_attention_kernel_2[grid]( - q, k, v, sm_scale, alibi, b_start_loc, b_seq_len, + q, + k, + v, + sm_scale, + alibi, + b_start_loc, + b_seq_len, o, None, - q.stride(0), - q.stride(1), + q.stride(0), + q.stride(1), q.stride(2), - k.stride(0), - k.stride(1), + k.stride(0), + k.stride(1), k.stride(2), - v.stride(0), - v.stride(1), + v.stride(0), + v.stride(1), v.stride(2), - o.stride(0), - o.stride(1), + o.stride(0), + o.stride(1), o.stride(2), BLOCK_M=BLOCK, BLOCK_DMODEL=Lk, @@ -307,7 +347,7 @@ def bloom_context_attn_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len, al num_warps=num_warps, num_stages=1, ) - + return @torch.no_grad() @@ -327,7 +367,7 @@ def llama_context_attn_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len): tmp = torch.empty((batch, head, max_input_len + 256), device=q.device, dtype=torch.float32) num_warps = 4 if Lk <= 64 else 8 # num_warps = 4 - + if triton.__version__ < "2.1.0": _context_flash_attention_kernel[grid]( q, @@ -337,7 +377,7 @@ def llama_context_attn_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len): b_start_loc, b_seq_len, tmp, - None, + None, o, q.stride(0), q.stride(1), @@ -362,32 +402,33 @@ def llama_context_attn_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len): ) else: kv_group_num = q.shape[1] // k.shape[1] - _context_flash_attention_kernel_2[grid]( - q, - k, - v, - sm_scale, + _context_flash_attention_kernel_2[grid]( + q, + k, + v, 
+ sm_scale, None, - b_start_loc, + b_start_loc, b_seq_len, o, kv_group_num, - q.stride(0), - q.stride(1), + q.stride(0), + q.stride(1), q.stride(2), - k.stride(0), - k.stride(1), + k.stride(0), + k.stride(1), k.stride(2), - v.stride(0), - v.stride(1), + v.stride(0), + v.stride(1), v.stride(2), - o.stride(0), - o.stride(1), + o.stride(0), + o.stride(1), o.stride(2), BLOCK_M=BLOCK, BLOCK_DMODEL=Lk, BLOCK_N=BLOCK, num_warps=num_warps, - num_stages=1,) - - return \ No newline at end of file + num_stages=1, + ) + + return diff --git a/colossalai/kernel/triton/flash_decoding.py b/colossalai/kernel/triton/flash_decoding.py index 9b7b27fa1f49..ac733dede3b7 100644 --- a/colossalai/kernel/triton/flash_decoding.py +++ b/colossalai/kernel/triton/flash_decoding.py @@ -1,8 +1,10 @@ # adepted from https://github.com/ModelTC/lightllm/blob/ece7b43f8a6dfa74027adc77c2c176cff28c76c8/lightllm/models/llama/triton_kernel/flash_decoding.py import torch + try: from lightllm.models.llama.triton_kernel.flash_decoding_stage1 import flash_decode_stage1 from lightllm.models.llama.triton_kernel.flash_decoding_stage2 import flash_decode_stage2 + HAS_LIGHTLLM_KERNEL = True except: print("install lightllm from https://github.com/ModelTC/lightllm/blob/ece7b43f8a6dfa74027adc77c2c176cff28c76c8") @@ -10,41 +12,36 @@ if HAS_LIGHTLLM_KERNEL: + def token_flash_decoding(q, o_tensor, infer_state, q_head_num, head_dim, cache_k, cache_v): BLOCK_SEQ = 256 batch_size = infer_state.batch_size max_len_in_batch = infer_state.max_len_in_batch - calcu_shape1 = (batch_size, q_head_num, head_dim) - if getattr(infer_state, 'mid_o', None) is None: - infer_state.mid_o = torch.empty([batch_size, - q_head_num, - max_len_in_batch // BLOCK_SEQ + 1, - head_dim], - dtype=torch.float32, - device="cuda") - infer_state.mid_o_logexpsum = torch.empty([batch_size, - q_head_num, - max_len_in_batch // BLOCK_SEQ + 1], - dtype=torch.float32, - device="cuda") + if getattr(infer_state, "mid_o", None) is None: + infer_state.mid_o = torch.empty( + [batch_size, q_head_num, max_len_in_batch // BLOCK_SEQ + 1, head_dim], + dtype=torch.float32, + device="cuda", + ) + infer_state.mid_o_logexpsum = torch.empty( + [batch_size, q_head_num, max_len_in_batch // BLOCK_SEQ + 1], dtype=torch.float32, device="cuda" + ) mid_o = infer_state.mid_o mid_o_logexpsum = infer_state.mid_o_logexpsum - flash_decode_stage1(q.view(calcu_shape1), - cache_k, - cache_v, - infer_state.block_loc, - infer_state.seq_len, - infer_state.max_len_in_batch, - mid_o, - mid_o_logexpsum, - BLOCK_SEQ) - flash_decode_stage2(mid_o, - mid_o_logexpsum, - infer_state.seq_len, - o_tensor.view(calcu_shape1), - BLOCK_SEQ) + flash_decode_stage1( + q.view(calcu_shape1), + cache_k, + cache_v, + infer_state.block_loc, + infer_state.seq_len, + infer_state.max_len_in_batch, + mid_o, + mid_o_logexpsum, + BLOCK_SEQ, + ) + flash_decode_stage2(mid_o, mid_o_logexpsum, infer_state.seq_len, o_tensor.view(calcu_shape1), BLOCK_SEQ) diff --git a/colossalai/kernel/triton/llama_act_combine_kernel.py b/colossalai/kernel/triton/llama_act_combine_kernel.py index 45996c0dca53..7a2c7e8fbd74 100644 --- a/colossalai/kernel/triton/llama_act_combine_kernel.py +++ b/colossalai/kernel/triton/llama_act_combine_kernel.py @@ -8,6 +8,7 @@ try: import triton import triton.language as tl + HAS_TRITON = True except ImportError: HAS_TRITON = False @@ -26,8 +27,8 @@ def _llama_act_combine_forward( X_GATE2, X_UP, Y, - stride, # how much to increase the pointer when moving by 1 row - N, # number of columns in X + stride, # how much to increase the pointer 
when moving by 1 row + N, # number of columns in X BLOCK_SIZE: tl.constexpr, ): # Map the program id to the row of X and Y it should compute. @@ -41,9 +42,9 @@ def _llama_act_combine_forward( for off in range(0, N, BLOCK_SIZE): cols = off + tl.arange(0, BLOCK_SIZE) mask = cols < N - x_gate1 = tl.load(X_GATE1 + cols, mask=mask, other=0.) - x_gate2 = tl.load(X_GATE2 + cols, mask=mask, other=0.) - x_up = tl.load(X_UP + cols, mask=mask, other=0.) + x_gate1 = tl.load(X_GATE1 + cols, mask=mask, other=0.0) + x_gate2 = tl.load(X_GATE2 + cols, mask=mask, other=0.0) + x_up = tl.load(X_UP + cols, mask=mask, other=0.0) x_gate2_sigmoid = tl.sigmoid(x_gate2.to(tl.float32)).to(x_gate2.dtype) y = x_gate1 * x_gate2 * x_gate2_sigmoid * x_up # Write output @@ -58,8 +59,8 @@ def _llama_act_combine_backward( X_GATE2_GRAD, X_UP_GRAD, Y_GRAD, - stride, # how much to increase the pointer when moving by 1 row - N, # number of columns in X + stride, # how much to increase the pointer when moving by 1 row + N, # number of columns in X BLOCK_SIZE: tl.constexpr, ): # Map the program id to the row of X and Y it should compute. @@ -76,10 +77,10 @@ def _llama_act_combine_backward( for off in range(0, N, BLOCK_SIZE): cols = off + tl.arange(0, BLOCK_SIZE) mask = cols < N - x_gate1 = tl.load(X_GATE1 + cols, mask=mask, other=0.) - x_gate2 = tl.load(X_GATE2 + cols, mask=mask, other=0.) - x_up = tl.load(X_UP + cols, mask=mask, other=0.) - y_grad = tl.load(Y_GRAD + cols, mask=mask, other=0.) + x_gate1 = tl.load(X_GATE1 + cols, mask=mask, other=0.0) + x_gate2 = tl.load(X_GATE2 + cols, mask=mask, other=0.0) + x_up = tl.load(X_UP + cols, mask=mask, other=0.0) + y_grad = tl.load(Y_GRAD + cols, mask=mask, other=0.0) # forward: y = x_gate1 * x_gate2 * tl.sigmoid(x_gate2) * x_up x_gate2_sigmoid = tl.sigmoid(x_gate2.to(tl.float32)).to(x_gate2.dtype) @@ -147,14 +148,9 @@ def forward(ctx: Any, x_gate: torch.Tensor, x_up: torch.Tensor, activation: str # restore setting ctx.M, ctx.N, ctx.BLOCK_SIZE, ctx.num_warps = M, N, BLOCK_SIZE, num_warps # enqueue kernel - _llama_act_combine_forward[(M,)](x_gate1, - x_gate2, - x_up, - y, - x_up.stride(-2), - N, - BLOCK_SIZE=BLOCK_SIZE, - num_warps=num_warps) + _llama_act_combine_forward[(M,)]( + x_gate1, x_gate2, x_up, y, x_up.stride(-2), N, BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps + ) return y @staticmethod @@ -166,20 +162,25 @@ def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, Tensor, None, Non # init grad y_grad = grad_outputs[0] - x_gate1_grad, x_gate2_grad, x_up_grad = torch.empty_like(x_gate1), torch.empty_like( - x_gate2), torch.empty_like(x_up) + x_gate1_grad, x_gate2_grad, x_up_grad = ( + torch.empty_like(x_gate1), + torch.empty_like(x_gate2), + torch.empty_like(x_up), + ) # enqueue kernel - _llama_act_combine_backward[(M,)](x_gate1, - x_gate2, - x_up, - x_gate1_grad, - x_gate2_grad, - x_up_grad, - y_grad, - x_up.stride(-2), - N, - BLOCK_SIZE=BLOCK_SIZE, - num_warps=num_warps) + _llama_act_combine_backward[(M,)]( + x_gate1, + x_gate2, + x_up, + x_gate1_grad, + x_gate2_grad, + x_up_grad, + y_grad, + x_up.stride(-2), + N, + BLOCK_SIZE=BLOCK_SIZE, + num_warps=num_warps, + ) x_gate_grad = torch.cat([x_gate1_grad, x_gate2_grad], dim=-1) return x_gate_grad, x_up_grad, None, None diff --git a/colossalai/kernel/triton/token_attention_kernel.py b/colossalai/kernel/triton/token_attention_kernel.py index de2003748e65..d8ac278c77dd 100644 --- a/colossalai/kernel/triton/token_attention_kernel.py +++ b/colossalai/kernel/triton/token_attention_kernel.py @@ -13,10 +13,18 @@ print("please 
install triton from https://github.com/openai/triton") try: - from lightllm.models.llama.triton_kernel.token_attention_nopad_reduceV import token_att_fwd2 as lightllm_llama_token_att_fwd2 - from lightllm.models.llama.triton_kernel.token_attention_nopad_att1 import token_att_fwd as lightllm_llama_token_att_fwd - from lightllm.models.llama.triton_kernel.token_attention_nopad_softmax import token_softmax_fwd as lightllm_llama_token_softmax_fwd - from lightllm.models.bloom.triton_kernel.token_attention_nopad_att1 import token_att_fwd as lightllm_bloom_token_att_fwd + from lightllm.models.bloom.triton_kernel.token_attention_nopad_att1 import ( + token_att_fwd as lightllm_bloom_token_att_fwd, + ) + from lightllm.models.llama.triton_kernel.token_attention_nopad_att1 import ( + token_att_fwd as lightllm_llama_token_att_fwd, + ) + from lightllm.models.llama.triton_kernel.token_attention_nopad_reduceV import ( + token_att_fwd2 as lightllm_llama_token_att_fwd2, + ) + from lightllm.models.llama.triton_kernel.token_attention_nopad_softmax import ( + token_softmax_fwd as lightllm_llama_token_softmax_fwd, + ) HAS_TRITON_TOKEN_ATTENTION = True except ImportError: @@ -205,9 +213,7 @@ def token_attn( if triton.__version__ == "2.0.0": prob = torch.empty_like(att_m_tensor) - lightllm_llama_token_softmax_fwd( - att_m_tensor, kv_cache_start_loc, kv_cache_seq_len, prob, max_len_in_batch - ) + lightllm_llama_token_softmax_fwd(att_m_tensor, kv_cache_start_loc, kv_cache_seq_len, prob, max_len_in_batch) att_m_tensor = None lightllm_llama_token_att_fwd2( diff --git a/colossalai/legacy/inference/tensor_parallel/modeling/llama.py b/colossalai/legacy/inference/tensor_parallel/modeling/llama.py index 448943b12c9e..21e5c5e7856b 100644 --- a/colossalai/legacy/inference/tensor_parallel/modeling/llama.py +++ b/colossalai/legacy/inference/tensor_parallel/modeling/llama.py @@ -8,7 +8,9 @@ from colossalai.inference.tensor_parallel.batch_infer_state import BatchInferState from colossalai.kernel.triton import llama_context_attn_fwd, token_attention_fwd from colossalai.kernel.triton.token_attention_kernel import Llama2TokenAttentionForwards + from ._utils import copy_kv_to_mem_cache + try: from lightllm.models.llama.triton_kernel.context_flashattention_nopad import ( context_attention_fwd as lightllm_llama_context_attention_fwd, @@ -90,7 +92,7 @@ def llama_triton_token_attention(query_states, attn_output, infer_state, num_key # infer_state.cache_manager.past_key_values_length, infer_state.max_len_in_batch, ) - + else: Llama2TokenAttentionForwards.token_attn( query_states, diff --git a/colossalai/moe/checkpoint.py b/colossalai/moe/checkpoint.py index 59a0ec3f0c39..86438936b56d 100644 --- a/colossalai/moe/checkpoint.py +++ b/colossalai/moe/checkpoint.py @@ -9,200 +9,109 @@ import torch.distributed as dist import torch.nn as nn from torch.distributed import ProcessGroup +from torch.distributed.distributed_c10d import get_global_rank -from colossalai.checkpoint_io import CheckpointIndexFile, HybridParallelCheckpointIO +from colossalai.checkpoint_io import CheckpointIndexFile +from colossalai.checkpoint_io.hybrid_parallel_checkpoint_io import HybridParallelCheckpointIO +from colossalai.checkpoint_io.index_file import CheckpointIndexFile from colossalai.checkpoint_io.utils import ( StateDictSharder, gather_distributed_param, get_model_base_filenames, get_optimizer_base_filenames, - is_safetensors_available, load_shard_state_dict, load_state_dict, - load_state_dict_into_model, load_states_into_optimizer, save_config_file, 
save_param_groups, save_state_dict, save_state_dict_shards, + search_tp_partition_dim, sharded_optimizer_loading_epilogue, ) -from colossalai.interface import OptimizerWrapper -from colossalai.moe.manager import MOE_MANAGER -from colossalai.tensor.moe_tensor.api import ( - get_dp_group, - get_dp_rank, - get_dp_size, - get_ep_group, - get_ep_rank, - get_ep_size, - is_moe_tensor, -) +from colossalai.interface import ModelWrapper, OptimizerWrapper +from colossalai.tensor.moe_tensor.api import is_moe_tensor + +try: + from torch.nn.modules.module import _EXTRA_STATE_KEY_SUFFIX +except ImportError: + _EXTRA_STATE_KEY_SUFFIX = "_extra_state" class MoECheckpointIO(HybridParallelCheckpointIO): def __init__( self, - dp_group: ProcessGroup, + global_dp_group: ProcessGroup, pp_group: ProcessGroup, tp_group: ProcessGroup, + ep_group: ProcessGroup, + moe_dp_group: ProcessGroup, zero_stage: int, + verbose: bool = True, ) -> None: - assert zero_stage in [ - 0, - 1, - 2, - ], f"zero_stage should be 0 or 1 or 2, got {zero_stage}" - super().__init__(dp_group, pp_group, tp_group, zero_stage) - self.parallel = MOE_MANAGER.parallel - - def pre_load_model(self, model: nn.Module, state_dict: dict) -> dict: - """ - Preprocess state_dict before loading and slice the state_dict of MOE tensors. - """ - for name, param in state_dict.items(): - if ".experts." in name: - if name in dict(model.named_parameters()): - model_param = dict(model.named_parameters())[name] - if is_moe_tensor(model_param): - ep_rank = get_ep_rank(model_param) - ep_size = get_ep_size(model_param) - expert_num = param.shape[0] // ep_size - assert param.shape[0] % ep_size == 0 - param = param[ep_rank * expert_num : (ep_rank + 1) * expert_num] - state_dict[name] = param - dist.barrier() - return state_dict - + super().__init__(global_dp_group, pp_group, tp_group, zero_stage, verbose) + self.global_dp_group = global_dp_group + self.global_dp_rank = dist.get_rank(global_dp_group) + self.global_dp_size = dist.get_world_size(global_dp_group) + self.pp_group = pp_group + self.tp_group = tp_group + + self.moe_dp_group = moe_dp_group + self.moe_dp_size = dist.get_world_size(moe_dp_group) + self.moe_dp_rank = dist.get_rank(moe_dp_group) + self.ep_group = ep_group + self.ep_size = dist.get_world_size(ep_group) + self.ep_rank = dist.get_rank(ep_group) + + @staticmethod def _model_sharder( - self, - state_dict: nn.Module, + model: nn.Module, prefix: str = "", keep_vars: bool = False, size_per_shard: int = 1024, + param_name_pattern: Optional[str] = None, ) -> Iterator[Tuple[OrderedDict, int]]: # An internel method that breaks state_dict of model into shards within limited size. + state_dict_sharder = StateDictSharder(size_per_shard) - for name, param in state_dict.items(): + # Save parameters. + for name, param in model.named_parameters(): if param is None: continue + if param_name_pattern is not None and param_name_pattern not in name: + continue # Gather tensor pieces when using tensor parallel. param_ = gather_distributed_param(param, keep_vars=False) block, block_size = state_dict_sharder.append_param(prefix + name, param_) if block is not None: yield block, block_size + # Save buffers. + for name, buf in model.named_buffers(): + if buf is not None and name not in model._non_persistent_buffers_set: + buffer = buf if keep_vars else buf.detach() + block, block_size = state_dict_sharder.append_param(prefix + name, buffer) + if block is not None: + yield block, block_size + + # Save extra states. 
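MoECheckpointIO._model_sharder above leaves the size bookkeeping to StateDictSharder: tensors are appended to the current block until the configured shard size would be exceeded, and each full block is yielded as one checkpoint file. A rough self-contained sketch of that idea follows; the byte-size accounting below is illustrative, not the library's exact logic.

import torch.nn as nn


def shard_state_dict(model: nn.Module, size_per_shard_bytes: int):
    # Accumulate parameters into a block until it would exceed the size limit,
    # then emit the block; the final partial block is emitted at the end.
    block, block_size = {}, 0
    for name, param in model.named_parameters():
        nbytes = param.numel() * param.element_size()
        if block and block_size + nbytes > size_per_shard_bytes:
            yield block, block_size
            block, block_size = {}, 0
        block[name] = param.detach()
        block_size += nbytes
    yield block, block_size


model = nn.Sequential(nn.Linear(64, 64), nn.Linear(64, 64))
for shard_idx, (block, size) in enumerate(shard_state_dict(model, size_per_shard_bytes=20_000)):
    print(shard_idx, sorted(block), size)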
+ extra_state_key = prefix + _EXTRA_STATE_KEY_SUFFIX + if ( + getattr(model.__class__, "get_extra_state", torch.nn.Module.get_extra_state) + is not torch.nn.Module.get_extra_state + ): + extra_state = model.get_extra_state() + block, block_size = state_dict_sharder.append_param(extra_state_key, extra_state) + if block is not None: + yield block, block_size + # Return the last block in sharder. yield state_dict_sharder.current_block, state_dict_sharder.current_block_size - def load_unsharded_model(self, model: nn.Module, checkpoint: str, strict: bool) -> None: - state_dict = torch.load(checkpoint) - state_dict = self.pre_load_model(model, state_dict) - model.load_state_dict(state_dict, strict=strict if self.pp_size == 1 else False) - - def load_sharded_model(self, model: nn.Module, checkpoint_index_file: Path, strict: bool = False): - """ - Load sharded model with the given path to index file of checkpoint folder. - - Args: - model (nn.Module): The model to be loaded. - checkpoint_index_file (str): Path to the index file of checkpointing folder. - strict (bool, optional): For name matching during loading state_dict. Defaults to False. - This argument should be manually set to False since params on same device might be stored in different files. - """ - - # Check whether the checkpoint uses safetensors. - use_safetensors = False - if "safetensors" in checkpoint_index_file.name: - use_safetensors = True - - if use_safetensors and not is_safetensors_available(): - raise ImportError("`safe_serialization` requires the `safetensors` library: `pip install safetensors`.") - - # Read checkpoint index file. - ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file) - ckpt_root_path = ckpt_index_file.root_path - weight_map = ckpt_index_file.weight_map - strict = False - - # Load params & buffers to model. - # Keep a record of loaded files so that file will not be repeatedly loaded. - loaded_file = set() - - def _load(name: str): - if name not in weight_map: - raise ValueError(f"{name} is not stored in checkpoint, please check your checkpointing configuration!") - filename = weight_map[name] - - # If this param/buffer has been loaded before, directly return. - if filename in loaded_file: - return - - file_path = os.path.join(ckpt_root_path, filename) - state_dict = load_shard_state_dict(Path(file_path), use_safetensors) - state_dict = self.pre_load_model(model, state_dict) - missing_keys = [] - - load_state_dict_into_model( - model, - state_dict, - missing_keys=missing_keys, - strict=strict, - load_sub_module=True, - ) - loaded_file.add(filename) - - # Load parameters. - for name, _ in model.named_parameters(): - _load(name) - - if self.verbose: - logging.info(f"The model has been successfully loaded from sharded checkpoint: {ckpt_root_path}.") - - def pre_save_model(self, model: nn.Module) -> dict: - state_dict = model.state_dict() - for name, param in model.named_parameters(): - if ".experts." 
in name and is_moe_tensor(param): - ep_group = get_ep_group(param) - ep_rank = get_ep_rank(param) - ep_size = get_ep_size(param) - dp_rank = get_dp_rank(param) - if dp_rank == 0: - param = param.data.cuda() - all_param = [torch.zeros_like(param) for _ in range(ep_size)] - # gather param from every ep rank - dist.all_gather(all_param, param, group=ep_group) - if ep_rank == 0: - all_param = torch.cat(all_param, dim=0) - state_dict[name] = all_param.cpu() - if self.pp_size > 1: - if self.dp_rank == 0: - out = [None for _ in range(self.pp_size)] - dist.all_gather_object(out, state_dict, group=self.pp_group) - if self.pp_rank == 0: - new_state_dict = {} - for o in out: - new_state_dict.update(o) - state_dict = new_state_dict - dist.barrier() - return state_dict - - def save_unsharded_model( - self, - model: nn.Module, - checkpoint: str, - gather_dtensor: bool, - use_safetensors: bool, - ): - state_dict = self.pre_save_model(model) - if dist.get_rank() == 0: - torch.save(state_dict, checkpoint) - dist.barrier() - def save_sharded_model( self, - model: nn.Module, + model: ModelWrapper, checkpoint: str, gather_dtensor: bool = True, prefix: Optional[str] = None, @@ -214,7 +123,9 @@ def save_sharded_model( The following files will be created under the path: - An index file (pytorch_model.bin.index.json) containing a map between model params/buffers and file names. - Multiple files that store state tensors of models. - The filenames are in the form of "pytorch_model.-000XX.bin" + If pipeline parallelism is used, the filenames are in the form of "pytorch_model.-stage-000XX-shard-000XX.bin". + If pipeline parallelism is not used, "pytorch_model.-000XX.bin" + Args: model (nn.Module): Model on local device to be saved. @@ -224,29 +135,35 @@ def save_sharded_model( size_per_shard (int, optional): Size per shard in MB. Defaults to 1024. use_safetensors (bool, optional): Whether to use safe tensors. Defaults to False. """ - torch.cuda.empty_cache() + + assert isinstance(model, ModelWrapper), "Please boost the model before saving!" + model = model.unwrap() + if os.path.isfile(checkpoint): logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") return Path(checkpoint).mkdir(parents=True, exist_ok=True) - # Then collect the sharded parameters & buffers along tp_group. - # Only devices with tp_rank == 0 are responsible for model saving. - state_dict = self.pre_save_model(model) - - if dist.get_rank() == 0: - state_dict_shard = self._model_sharder(state_dict, size_per_shard=size_per_shard) + if self.moe_dp_rank != 0: + dist.barrier() + return - # Devices along the same dp_group share the same copies of model. - # So only let the device with dp_rank == 0 save the model. - if self.dp_rank != 0: - return + # ep_rank 0 saves all the parameters and buffers. + # other ep_ranks save only experts + ep_param_pattern = "experts." if self.ep_rank != 0 else None - weights_name, save_index_file = get_model_base_filenames(prefix, use_safetensors) - index_file = CheckpointIndexFile(checkpoint) - control_saving = self.tp_rank == 0 + # Then collect the sharded parameters & buffers along tp_group. + # Only devices with tp_rank == 0 are responsible for model saving. 
+ state_dict_shard = MoECheckpointIO._model_sharder( + model, size_per_shard=size_per_shard, param_name_pattern=ep_param_pattern + ) + weights_name, save_index_file = get_model_base_filenames(prefix, use_safetensors) + index_file = CheckpointIndexFile(checkpoint) + control_saving = self.tp_rank == 0 + if self.pp_size == 1 and self.ep_size == 1: + # When pipeline is not used, save the model shards as in general checkpointIO total_size = save_state_dict_shards( sharded_state_dict=state_dict_shard, checkpoint=checkpoint, @@ -259,264 +176,81 @@ def save_sharded_model( index_file.append_meta_data("total_size", total_size) index_file.write_index_file(save_index_file) save_config_file(model, checkpoint) - if self.verbose: + if self.verbose and self.coordinator.is_master(): logging.info( f"The model is split into checkpoint shards. " f"You can find where each parameters has been saved in the " f"index located at {save_index_file}." ) - dist.barrier() - torch.cuda.empty_cache() - - # ======================================================== - # Abstract methods for optimizer loading/saving implementation - # ======================================================== - - def pre_load_optim( - self, - state: OrderedDict, - working_param, - current_shape: torch.Size, - original_shape: torch.Size, - device: torch.device, - inplace: bool, - ) -> OrderedDict: - """ - With complete optimizer states of a specific parameter loaded from checkpoint, - slice out the sharded optimizer states kept by current device. - - Args: - state (OrderedDict): Complete optimizer states of a given parameter, loaded from checkpoint. - current_shape (torch.Size): The size of parameter after sharding. - original_shape (torch.Size): The size of parameter before sharding. - device (torch.device): The destination device of loaded optimizer states. - inplace (bool): If set to True, will update the values of argument 'state' in place. Else will make a copy of state. - - Returns: - OrderedDict: The sharded optimizer state of the given parameter. - """ - state_ = state if inplace else copy.deepcopy(state) - is_moe_tensor_flag = is_moe_tensor(working_param) - if is_moe_tensor_flag: - ep_rank = get_ep_rank(working_param) - ep_size = get_ep_size(working_param) - - for k, v in state_.items(): - if isinstance(v, torch.Tensor) and k != "step": - if is_moe_tensor_flag: - with torch.no_grad(): - expert_num = v.shape[0] // ep_size - assert v.shape[0] % ep_size == 0 - v = v[ep_rank * expert_num : (ep_rank + 1) * expert_num] - else: - # Shard state along data parallel group when using Zero. - padding_size = (self.dp_size - v.numel() % self.dp_size) % self.dp_size - with torch.no_grad(): - v = v.flatten() - if padding_size > 0: - v = torch.nn.functional.pad(v, [0, padding_size]) - slice_size = v.numel() // self.dp_size - v = v.split(slice_size, dim=0)[self.dp_rank] - - state_[k] = v.detach().clone().to(device) - - return state_ - - def load_sharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint_index_file: str, prefix: str = ""): - """ - Load sharded optimizer with the given path to index file of checkpoint folder. - - Args: - optimizer (OptimizerWrapper): The optimizer to be loaded. - checkpoint_index_file (str): Path to the index file of checkpointing folder. - prefix (str): Not used. - """ - assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before loading!" 
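The ep_param_pattern passed to _model_sharder above implements the saving rule spelled out in the comment: expert-parallel rank 0 saves all parameters and buffers, while the other EP ranks contribute only their local experts. A toy illustration of the name filter (the module layout here is invented):

import torch.nn as nn

# Hypothetical two-module layout, just to exercise the filter; only names
# containing "experts." survive on non-zero EP ranks.
model = nn.ModuleDict({"experts": nn.Linear(4, 4), "gate": nn.Linear(4, 4)})
ep_rank = 1
param_name_pattern = "experts." if ep_rank != 0 else None
kept = [
    name
    for name, _ in model.named_parameters()
    if param_name_pattern is None or param_name_pattern in name
]
print(kept)  # ['experts.weight', 'experts.bias']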
- - def _get_param_id_from_optimizer_param( - param: torch.Tensor, master_to_working_map: Optional[Dict[int, torch.Tensor]] = None, optimizer=None - ): - if master_to_working_map is not None and id(param) in master_to_working_map: - working_param = master_to_working_map[id(param)] - elif hasattr(optimizer, "moe_master_to_working_map") and id(param) in optimizer.moe_master_to_working_map: - working_param = optimizer.moe_master_to_working_map[id(param)] - else: - working_param = param - return optimizer.param_info["param2id"][id(working_param)] - # id_map is a mapping from param ids kept by current pipeline, to their corresponding parameter objects. - # When Zero is used, the mapped parameter objects should be fp32 master parameters. - # IDs should be obtained through saved param2id mapping earlier saved in optimizer.param_info. - id_map = {} - master_to_working_map = optimizer.get_master_to_working_map() - for pg in optimizer.optim.param_groups: - for param in pg["params"]: - param_id = _get_param_id_from_optimizer_param(param, master_to_working_map, optimizer) - id_map[param_id] = param + dist.barrier() + else: + # When pipeline is used, each stage produces its own shard files and index files. + # Index files belonging to each stage are saved under a temporary folder ./tmp_index_files/ + # After all the state_dicts have been saved, the master rank integrates all the index files into one final index file and deletes the tmp folder. - # Read checkpoint index file. - ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file) - ckpt_root_path = ckpt_index_file.root_path - weight_map = ckpt_index_file.weight_map - weight_map = {int(k): v for k, v in weight_map.items()} # convert saved id from str to int + final_index_file_path = copy.deepcopy(save_index_file) + tmp_index_file_folder = os.path.join(checkpoint, "tmp_index_files") + Path(tmp_index_file_folder).mkdir(parents=True, exist_ok=True) - # Load param_groups - param_group_path = ckpt_index_file.get_param_group_filename() - if param_group_path is None: - raise RuntimeError( - f"Invalid index file path {checkpoint_index_file} for an optimizer. \ - Lacking param group file under current directory." + # Manage filenames of sharded weights and index file for each pipeline stage. + weights_name = weights_name.replace(".bin", f"-stage-{self.pp_rank+1:05d}-{self.ep_rank+1:05d}-shard.bin") + weights_name = weights_name.replace( + ".safetensors", f"-stage-{self.pp_rank+1:05d}-{self.ep_rank+1:05d}-shard.safetensors" ) - saved_groups = torch.load(param_group_path) - - updated_groups = [] - for old_pg, saved_pg in zip(optimizer.optim.param_groups, saved_groups): - # obtain updated param group - new_pg = copy.deepcopy(saved_pg) - new_pg["params"] = old_pg["params"] # The parameters in the same group shouldn't change. - updated_groups.append(new_pg) - # ep param group - if len(optimizer.optim.param_groups) > len(saved_groups): - new_pg = copy.deepcopy(saved_pg) - new_pg["params"] = optimizer.optim.param_groups[-1]["params"] - updated_groups.append(new_pg) - optimizer.optim.__dict__.update({"param_groups": updated_groups}) - - # Load saved states to optimizer. - # Keep a record of loaded files so that file will not be repeatedly loaded. 
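Both the removed loader above and its replacement further below hinge on the same indirection: optimizer states are keyed by the fp32 master parameters, while param_info["param2id"] is keyed by the original working parameters, so every lookup goes master to working to id via id(). A compact sketch of that pattern with dummy tensors:

import torch

# Dummy fp16 working parameter and its fp32 master copy, as ZeRO keeps them.
working = torch.nn.Parameter(torch.randn(4, 4).half())
master = working.detach().float().requires_grad_()

master_to_working = {id(master): working}  # mirrors get_master_to_working_map()
param2id = {id(working): 0}                # mirrors optimizer.param_info["param2id"]


def get_param_id(param):
    working_param = master_to_working.get(id(param), param)
    return param2id[id(working_param)]


print(get_param_id(master), get_param_id(working))  # 0 0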
- loaded_file = set() - for pg in optimizer.optim.param_groups: - for param in pg["params"]: - if param is None: - continue - param_id = _get_param_id_from_optimizer_param(param, master_to_working_map, optimizer) - if param_id not in weight_map: - continue - filename = weight_map[param_id] - - # If this param's states has been loaded before, directly return. - if filename in loaded_file: - continue - - file_path = os.path.join(ckpt_root_path, filename) - state_dict = load_shard_state_dict(Path(file_path), use_safetensors=False) - - # Then shard the loaded optimizer states if using tp/zero. - for pid, state in list(state_dict.items()): - if pid in id_map: - param = id_map[pid] - if master_to_working_map is not None and id(param) in master_to_working_map: - working_param = master_to_working_map[id(param)] - elif ( - hasattr(optimizer, "moe_master_to_working_map") - and id(param) in optimizer.moe_master_to_working_map - ): - working_param = optimizer.moe_master_to_working_map[id(param)] - else: - working_param = param - original_shape = optimizer.param_info["param2shape"][id(working_param)] - sharded_state = self.pre_load_optim( - state, - working_param, - current_shape=working_param.shape, - original_shape=original_shape, - device="cpu", - inplace=True, - ) - state_dict[pid] = sharded_state - - load_states_into_optimizer(optimizer.optim, state_dict, id_map, strict=True) - loaded_file.add(filename) - - sharded_optimizer_loading_epilogue(optimizer.optim) - if self.verbose and self.coordinator.is_master(): - logging.info(f"The optimizer has been successfully loaded from sharded checkpoint: {ckpt_root_path}.") - dist.barrier() - - def load_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str): - """ - Load optimizer from a file with given path. - - Args: - optimizer (OptimizerWrapper): The optimizer to be loaded. - checkpoint_index_file (str): Path to the checkpoint file. - """ + save_index_file = save_index_file.replace(".json", f"-stage-{self.pp_rank+1:05d}-{self.ep_rank+1:05d}.json") + save_index_file = os.path.join("tmp_index_files", save_index_file) - def _get_param_id_from_optimizer_param( - param: torch.Tensor, master_to_working_map: Optional[Dict[int, torch.Tensor]] = None - ): - if master_to_working_map is not None and id(param) in master_to_working_map: - working_param = master_to_working_map[id(param)] - else: - working_param = param - if id(working_param) in optimizer.param_info["param2id"]: - return optimizer.param_info["param2id"][id(working_param)] + total_size = save_state_dict_shards( + sharded_state_dict=state_dict_shard, + checkpoint=checkpoint, + index_file=index_file, + base_filename=weights_name, + is_master=control_saving, + use_safetensors=use_safetensors, + use_pp_format=True, + ) + if control_saving: + index_file.append_meta_data("total_size", total_size) + index_file.write_index_file(save_index_file) else: - None - - if self.coordinator.is_master(): - logging.warning("Please avoid using unsharded checkpointing methods when dealing with large models!") - - assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before loading!" + dist.barrier() + return - # Complete optimizer state_dict loaded from checkpoint, need to be processed later. - state_dict = load_state_dict(checkpoint) + dist.barrier() - # Load param_groups. 
- updated_groups = [] - saved_groups = state_dict["param_groups"] - for old_pg, saved_pg in zip(optimizer.optim.param_groups, saved_groups): - new_pg = copy.deepcopy(saved_pg) - new_pg["params"] = old_pg["params"] # Only keep the parameters kept by current pipeline stage. - updated_groups.append(new_pg) - # ep extra group - if MOE_MANAGER.parallel == "EP": - new_pg = copy.deepcopy(saved_pg) - new_pg["params"] = optimizer.optim.param_groups[-1][ - "params" - ] # Only keep the parameters kept by current pipeline stage. - for param in new_pg["params"]: - param.data = param.data.to(torch.float32) - updated_groups.append(new_pg) - optimizer.optim.__dict__.update({"param_groups": updated_groups}) + # The global master rank integrates the index files and clean the folder. + if self.coordinator.is_master(): + final_index_file = CheckpointIndexFile(checkpoint) + final_index_file.append_meta_data("total_size", 0) - # Load saved states to optimizer. First discard those states not belonging to current pipeline stage. - master_to_working_map = optimizer.get_master_to_working_map() - id_map = {} - for pg in optimizer.optim.param_groups: - for param in pg["params"]: - param_id = _get_param_id_from_optimizer_param(param, master_to_working_map) - if param_id is not None: - id_map[param_id] = param - load_states_into_optimizer(optimizer.optim, state_dict["state"], id_map, strict=True) + for filename in os.listdir(tmp_index_file_folder): + stage_index_file = CheckpointIndexFile.from_file(os.path.join(tmp_index_file_folder, filename)) + final_index_file.metadata["total_size"] += stage_index_file.metadata["total_size"] + for weight, weight_filename in stage_index_file.weight_map.items(): + final_index_file.append_weight_map(weight, weight_filename) - # Then shard the loaded optimizer states if using tp/zero. - for param, state in optimizer.optim.state.items(): - if param is None: - continue - device = param.device - if master_to_working_map is not None and id(param) in master_to_working_map: - working_param = master_to_working_map[id(param)] - else: - working_param = param - original_shape = optimizer.param_info["param2shape"][id(working_param)] - sharded_state = self.pre_load_optim( - state, - param, - current_shape=working_param.shape, - original_shape=original_shape, - device=device, - inplace=True, - ) - optimizer.optim.state[param] = sharded_state - sharded_optimizer_loading_epilogue(optimizer.optim) - dist.barrier() + final_index_file.write_index_file(final_index_file_path) + save_config_file(model, checkpoint) + rmtree(tmp_index_file_folder) + if self.verbose and self.coordinator.is_master(): + logging.info( + f"The model is split into checkpoint shards. " + f"You can find where each parameters has been saved in the " + f"index located at {final_index_file_path}." + ) - def pre_save_optim( - self, + @staticmethod + def gather_from_sharded_optimizer_state( state: OrderedDict, param: torch.Tensor, + original_shape: torch.Size, + global_dp_group: ProcessGroup, + tp_group: ProcessGroup, + use_zero: bool, inplace: bool, + is_moe_param: bool, + moe_dp_group: ProcessGroup = None, device: torch.device = torch.device("cpu"), ) -> OrderedDict: """ @@ -526,7 +260,7 @@ def pre_save_optim( state (OrderedDict): Optimizer states of given parameter, might be distributed among tp/dp group if using TP/Zero. param (torch.Tensor): The given parameter. It should be working_param when using Zero. original_shape (torch.Size): The size of parameter before sharding. - dp_group (ProcessGroup): The process group of data parallel. 
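The search_tp_partition_dim helper imported near the top of this file is what tells the gather below where to concatenate: it compares the sharded and original shapes and reports which dimension was split tp_size ways, or None if the parameter was not tensor-parallel sharded. The re-implementation below is assumed from how the result is used here, not copied from the library.

from typing import Optional, Sequence


def find_tp_partition_dim(
    current_shape: Sequence[int], original_shape: Sequence[int], tp_size: int
) -> Optional[int]:
    # Assumed semantics: exactly one dimension shrinks by a factor of tp_size
    # when the parameter is TP-sharded; otherwise there is nothing to gather.
    for dim, (cur, orig) in enumerate(zip(current_shape, original_shape)):
        if cur != orig:
            assert cur * tp_size == orig, "unexpected sharding layout"
            return dim
    return None


print(find_tp_partition_dim((4, 8), (4, 16), 2))  # 1
print(find_tp_partition_dim((4, 8), (4, 8), 2))   # None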
+ global_dp_group (ProcessGroup): The process group of data parallel. tp_group (ProcessGroup): The process group of tensor parallel. use_zero (bool): Whether Zero is used. inplace (bool): If set to True, will update the values of argument 'state' in place. Else will make a copy of state. @@ -535,67 +269,92 @@ def pre_save_optim( Returns: OrderedDict: The complete optimizer state of given parameter. """ - if is_moe_tensor(param): - moe_dp_group = get_dp_group(param) - moe_dp_size = get_dp_size(param) - moe_ep_group = get_ep_group(param) - moe_ep_size = get_ep_size(param) + global_dp_size = dist.get_world_size(global_dp_group) + tp_size = dist.get_world_size(tp_group) + moe_dp_size = dist.get_world_size(moe_dp_group) if moe_dp_group is not None else 1 + current_shape = param.shape state_ = state if inplace else copy.deepcopy(state) - for k, v in state_.items(): if isinstance(v, torch.Tensor) and k != "step": - # moe param - if is_moe_tensor(param): - # dp gather - v = v.cuda() - gather_tensor = [torch.zeros_like(v) for _ in range(moe_dp_size)] - dist.all_gather(gather_tensor, v, group=moe_dp_group) - v = torch.stack(gather_tensor).view(-1)[: param.numel()].reshape_as(param) - # ep gather - gather_tensor = [torch.zeros_like(v) for _ in range(moe_ep_size)] - dist.all_gather(gather_tensor, v, group=moe_ep_group) - v = torch.cat(gather_tensor, dim=0) - else: - # global dp - v = v.cuda() - gather_tensor = [torch.zeros_like(v) for _ in range(dist.get_world_size(self.dp_group))] - dist.all_gather(gather_tensor, v, group=self.dp_group) - v = torch.stack(gather_tensor).view(-1)[: param.numel()].reshape_as(param) - + v = v.cuda() + + # First gather Zero shards. + if use_zero and is_moe_param and moe_dp_size > 1: + moe_dp_rank = dist.get_rank(moe_dp_group) + dst = get_global_rank(moe_dp_group, 0) + if moe_dp_rank == 0: + gather_tensor = [torch.zeros_like(v) for _ in range(moe_dp_size)] + dist.gather(v, gather_tensor, group=moe_dp_group, dst=dst) + v = torch.stack(gather_tensor).view(-1)[: param.numel()].reshape_as(param) + else: + dist.gather(v, group=moe_dp_group, dst=dst) + + elif use_zero and not is_moe_param and global_dp_size > 1: + dp_rank = dist.get_rank(global_dp_group) + dst = get_global_rank(global_dp_group, 0) + if dp_rank == 0: + gather_tensor = [torch.zeros_like(v) for _ in range(global_dp_size)] + dist.gather(v, gather_tensor, group=global_dp_group, dst=dst) + v = torch.stack(gather_tensor).view(-1)[: param.numel()].reshape_as(param) + else: + dist.gather(v, group=global_dp_group, dst=dst) + + # Then gather TP shards. + partition_dim = search_tp_partition_dim(current_shape, original_shape, tp_size) + if partition_dim is not None: + tp_rank = dist.get_rank(tp_group) + dst = get_global_rank(tp_group, 0) + if tp_rank == 0: + gather_tensor = [torch.zeros_like(v) for _ in range(tp_size)] + dist.gather(v, gather_tensor, group=tp_group, dst=dst) + v = torch.cat(gather_tensor, dim=partition_dim) + else: + dist.gather(v, group=tp_group, dst=dst) state_[k] = v.detach().clone().to(device) return state_ + @staticmethod def _optimizer_sharder( - self, optimizer: OptimizerWrapper, + use_zero: bool, + global_dp_group: ProcessGroup, + tp_group: ProcessGroup, + moe_dp_group: ProcessGroup, size_per_shard: int = 1024, + only_moe_param: bool = False, ): # An internel method that breaks state_dict of optimizer into shards within limited size. 
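The ZeRO branch above reassembles a flat, possibly padded optimizer state from the gathered shards with torch.stack(...).view(-1)[: param.numel()].reshape_as(param), matching the padding scheme used when the state was sharded. The round trip is easy to check on a single process:

import torch

param = torch.arange(10.0).reshape(2, 5)
dp_size = 4

# What each data-parallel rank would hold under ZeRO: a flat, padded slice.
flat = param.flatten()
padding = (dp_size - flat.numel() % dp_size) % dp_size
flat = torch.nn.functional.pad(flat, [0, padding])
shards = list(flat.chunk(dp_size))

# The gather path in gather_from_sharded_optimizer_state: stack, flatten,
# drop the padding, and restore the original parameter shape.
restored = torch.stack(shards).view(-1)[: param.numel()].reshape_as(param)
assert torch.equal(restored, param)
print(restored)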
state_dict_sharder = StateDictSharder(size_per_shard) param_info = optimizer.param_info master_to_working_map = optimizer.get_master_to_working_map() - + dist.get_world_size(moe_dp_group) for param, state in optimizer.optim.state.items(): if param is None: continue - if master_to_working_map is not None and id(param) in master_to_working_map: + if master_to_working_map is not None: working_param = master_to_working_map[id(param)] - elif hasattr(optimizer, "moe_master_to_working_map") and id(param) in optimizer.moe_master_to_working_map: - working_param = optimizer.moe_master_to_working_map[id(param)] else: working_param = param - param_id = param_info["param2id"][id(working_param)] - state_ = self.pre_save_optim( + original_shape = param_info["param2shape"][id(working_param)] + state_ = MoECheckpointIO.gather_from_sharded_optimizer_state( state, working_param, + original_shape=original_shape, + global_dp_group=global_dp_group, + moe_dp_group=moe_dp_group, + tp_group=tp_group, + use_zero=use_zero, inplace=False, - device=torch.device("cuda"), + is_moe_param=is_moe_tensor(working_param), # TODO: Check correctness here ) + if only_moe_param and not is_moe_tensor(working_param): + continue + block, block_size = state_dict_sharder.append_optim_state(param_id, state_) if block is not None: yield block, block_size @@ -627,7 +386,6 @@ def save_sharded_optimizer( prefix (str): Perfix of file to save size_per_shard (int): Max file size of each file shard that store state tensors """ - torch.cuda.empty_cache() assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before saving!" if os.path.isfile(checkpoint): logging.error(f"Provided path ({checkpoint}) should be a directory, not a file") @@ -635,21 +393,30 @@ def save_sharded_optimizer( Path(checkpoint).mkdir(parents=True, exist_ok=True) - # Devices along the same dp_group share the same copies of states when zero is not used. - # In this case only let the device with dp_rank == 0 save the model. - if not self.use_zero and self.dp_rank != 0: + # If optim states are not sharded, other ranks don't need to participate in gather. + if not self.use_zero and self.moe_dp_rank != 0: + dist.barrier() return # Then collect the sharded states along dp_group(if using zero)/tp_group. # Only devices with (dp_rank == 0 and tp_rank == 0) are responsible for states saving. - state_dict_shard = self._optimizer_sharder( + state_dict_shard = MoECheckpointIO._optimizer_sharder( optimizer, + use_zero=self.use_zero, + global_dp_group=self.global_dp_group, + tp_group=self.tp_group, + moe_dp_group=self.moe_dp_group, size_per_shard=size_per_shard, + only_moe_param=self.ep_rank != 0, ) states_name, save_index_file, param_group_file = get_optimizer_base_filenames(prefix) index_file = CheckpointIndexFile(checkpoint) - control_saving = self.dp_rank == 0 and self.tp_rank == 0 - if self.pp_size == 1: + # e.g. dp_size = 4, moe_dp_size = 2, ep_size = 2 and use gather + # rank 0 saves moe & non-moe params; rank 1 only saves moe params + # rank 3 & 4 save nothing + control_saving = self.tp_rank == 0 and self.moe_dp_rank == 0 + + if self.pp_size == 1 and self.ep_size == 1: # When pipeline is not used, save the optimizer shards as in general checkpointIO total_size = save_state_dict_shards( sharded_state_dict=state_dict_shard, @@ -663,7 +430,11 @@ def save_sharded_optimizer( # Store param groups. 
index_file.append_meta_data("param_groups", param_group_file) group_file_path = os.path.join(checkpoint, param_group_file) - save_param_groups(optimizer.param_info, group_file_path) + param_groups = [ + {**group, "params": group_info["params"]} + for group, group_info in zip(optimizer.param_groups, optimizer.param_info["param_groups"]) + ] + save_param_groups({"param_groups": param_groups}, group_file_path) # Store index file. index_file.append_meta_data("total_size", total_size) index_file.write_index_file(save_index_file) @@ -674,6 +445,7 @@ def save_sharded_optimizer( f"index located at {save_index_file}." ) + dist.barrier() else: # When pipeline is used, each stage produces its own shard files and index files. # Index files belonging to each stage are saved under a temporary folder ./tmp_index_files/ @@ -684,8 +456,8 @@ def save_sharded_optimizer( Path(tmp_index_file_folder).mkdir(parents=True, exist_ok=True) # Manage filenames of sharded weights and index file for each pipeline stage. - states_name = states_name.replace(".bin", f"-stage-{self.pp_rank+1:05d}-shard.bin") - save_index_file = save_index_file.replace(".json", f"-stage-{self.pp_rank+1:05d}.json") + states_name = states_name.replace(".bin", f"-stage-{self.pp_rank+1:05d}-{self.ep_rank+1:05d}-shard.bin") + save_index_file = save_index_file.replace(".json", f"-stage-{self.pp_rank+1:05d}-{self.ep_rank+1:05d}.json") save_index_file = os.path.join("tmp_index_files", save_index_file) total_size = save_state_dict_shards( @@ -698,18 +470,17 @@ def save_sharded_optimizer( ) if control_saving: - assert ( - self.dp_rank == 0 and self.tp_rank == 0 - ), "The saving process should have both dp_rank and tp_rank as 0." index_file.append_meta_data("total_size", total_size) index_file.write_index_file(save_index_file) + print(f"rank {dist.get_rank()} writing index file") else: + dist.barrier() return - dist.barrier(self.pp_group) + dist.barrier() # The global master rank integrates the index files and clean the folder. - if self.pp_rank == 0: + if self.coordinator.is_master(): final_index_file = CheckpointIndexFile(checkpoint) final_index_file.append_meta_data("total_size", 0) @@ -722,7 +493,11 @@ def save_sharded_optimizer( # Store param groups. final_index_file.append_meta_data("param_groups", param_group_file) group_file_path = os.path.join(checkpoint, param_group_file) - save_param_groups(optimizer.param_info, group_file_path) + param_groups = [ + {**group, "params": group_info["params"]} + for group, group_info in zip(optimizer.param_groups, optimizer.param_info["param_groups"]) + ] + save_param_groups({"param_groups": param_groups}, group_file_path) final_index_file.write_index_file(final_index_file_path) rmtree(tmp_index_file_folder) @@ -733,8 +508,218 @@ def save_sharded_optimizer( f"You can find where each parameters has been saved in the " f"index located at {final_index_file_path}." ) - torch.cuda.empty_cache() + def load_sharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint_index_file: str, prefix: str = ""): + """ + Load sharded optimizer with the given path to index file of checkpoint folder. + + Args: + optimizer (OptimizerWrapper): The optimizer to be loaded. + checkpoint_index_file (str): Path to the index file of checkpointing folder. + prefix (str): Not used. + """ + assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before loading!" 
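# Before walking through loading, a pure-Python recap of the saving roles set
# up above (control_saving = tp_rank == 0 and moe_dp_rank == 0, and
# only_moe_param = ep_rank != 0). The helper name is an assumption used only
# to illustrate which ranks write which optimizer shards.
def optimizer_saving_role(tp_rank: int, moe_dp_rank: int, ep_rank: int) -> str:
    if tp_rank != 0 or moe_dp_rank != 0:
        return "saves nothing"  # duplicate states live on rank 0 of the tp / moe-dp groups
    return "saves MoE states only" if ep_rank != 0 else "saves all states"

# e.g. global dp_size = 4 split into moe_dp_size = 2 x ep_size = 2:
# (moe_dp_rank, ep_rank) = (0, 0) saves everything, (0, 1) saves only expert
# states, and the two ranks with moe_dp_rank = 1 save nothing.
assert optimizer_saving_role(0, 0, 0) == "saves all states"
assert optimizer_saving_role(0, 0, 1) == "saves MoE states only"
assert optimizer_saving_role(0, 1, 1) == "saves nothing"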
+ + def _get_param_id_from_optimizer_param( + param: torch.Tensor, master_to_working_map: Optional[Dict[int, torch.Tensor]] = None + ): + if master_to_working_map is not None: + working_param = master_to_working_map[id(param)] + else: + working_param = param + return optimizer.param_info["param2id"][id(working_param)] + + # id_map is a mapping from param ids kept by current pipeline, to their corresponding parameter objects. + # When Zero is used, the mapped parameter objects should be fp32 master parameters. + # IDs should be obtained through saved param2id mapping earlier saved in optimizer.param_info. + id_map = {} + master_to_working_map = optimizer.get_master_to_working_map() + for pg in optimizer.optim.param_groups: + for param in pg["params"]: + param_id = _get_param_id_from_optimizer_param(param, master_to_working_map) + id_map[param_id] = param + + # Read checkpoint index file. + ckpt_index_file = CheckpointIndexFile.from_file(checkpoint_index_file) + ckpt_root_path = ckpt_index_file.root_path + weight_map = ckpt_index_file.weight_map + weight_map = {int(k): v for k, v in weight_map.items()} # convert saved id from str to int + + # Load param_groups + param_group_path = ckpt_index_file.get_param_group_filename() + if param_group_path is None: + raise RuntimeError( + f"Invalid index file path {checkpoint_index_file} for an optimizer. \ + Lacking param group file under current directory." + ) + saved_groups = torch.load(param_group_path) + + updated_groups = [] + for old_pg, saved_pg in zip(optimizer.optim.param_groups, saved_groups): + # obtain updated param group + new_pg = copy.deepcopy(saved_pg) + new_pg["params"] = old_pg["params"] # The parameters in the same group shouln't change. + updated_groups.append(new_pg) + # ep param groups + if len(optimizer.optim.param_groups) == len(saved_groups) + 1: + new_pg = copy.deepcopy(saved_pg) + new_pg["params"] = optimizer.optim.param_groups[-1]["params"] + updated_groups.append(new_pg) + optimizer.optim.__dict__.update({"param_groups": updated_groups}) + + # Load saved states to optimizer. + # Keep a record of loaded files so that file will not be repeatedly loaded. + loaded_file = set() + for pg in optimizer.optim.param_groups: + for param in pg["params"]: + if param is None: + continue + param_id = _get_param_id_from_optimizer_param(param, master_to_working_map) + if param_id not in weight_map: + continue + filename = weight_map[param_id] + + # If this param's states has been loaded before, directly return. + if filename in loaded_file: + continue + + file_path = os.path.join(ckpt_root_path, filename) + state_dict = load_shard_state_dict(Path(file_path), use_safetensors=False) + load_states_into_optimizer(optimizer.optim, state_dict, id_map, strict=True) + loaded_file.add(filename) + + # Then shard the loaded optimizer states if using tp/zero. 
+ for param, state in optimizer.optim.state.items(): + device = param.device + if master_to_working_map is not None: + working_param = master_to_working_map[id(param)] + else: + working_param = param + original_shape = optimizer.param_info["param2shape"][id(working_param)] + sharded_state = self.shard_from_complete_optimizer_state( + state, + current_shape=working_param.shape, + original_shape=original_shape, + device=device, + inplace=True, + is_moe_param=is_moe_tensor(working_param), + ) + optimizer.optim.state[param] = sharded_state + + sharded_optimizer_loading_epilogue(optimizer.optim) + if self.verbose and self.coordinator.is_master(): + logging.info(f"The optimizer has been successfully loaded from sharded checkpoint: {ckpt_root_path}.") + + def shard_from_complete_optimizer_state( + self, + state: OrderedDict, + current_shape: torch.Size, + original_shape: torch.Size, + device: torch.device, + inplace: bool, + is_moe_param: bool, + ) -> OrderedDict: + """ + With complete optimizer states of a specific parameter loaded from checkpoint, + slice out the sharded optimizer states kept by current device. + + Args: + state (OrderedDict): Complete optimizer states of a given parameter, loaded from checkpoint. + current_shape (torch.Size): The size of parameter after sharding. + original_shape (torch.Size): The size of parameter before sharding. + device (torch.device): The destination device of loaded optimizer states. + inplace (bool): If set to True, will update the values of argument 'state' in place. Else will make a copy of state. + + Returns: + OrderedDict: The sharded optimizer state of the given parameter. + """ + state_ = state if inplace else copy.deepcopy(state) + for k, v in state_.items(): + if isinstance(v, torch.Tensor) and k != "step": + # Shard state along tensor parallel group. + partition_dim = search_tp_partition_dim(current_shape, original_shape, self.tp_size) + if partition_dim is not None: + slice_size = current_shape[partition_dim] + v = v.split(slice_size, dim=partition_dim)[self.tp_rank] + + # Shard state along data parallel group when using Zero. + if self.use_zero and not is_moe_param and self.global_dp_size > 1: + padding_size = (self.global_dp_size - v.numel() % self.global_dp_size) % self.global_dp_size + with torch.no_grad(): + v = v.flatten() + if padding_size > 0: + v = torch.nn.functional.pad(v, [0, padding_size]) + slice_size = v.numel() // self.global_dp_size + v = v.split(slice_size, dim=0)[self.global_dp_rank] + + elif self.use_zero and is_moe_param and self.moe_dp_size > 1: + # LowLevelZeRO pads by global dp size for now. + # TODO: update both to use moe dp size + padding_size = (self.global_dp_size - v.numel() % self.global_dp_size) % self.global_dp_size + with torch.no_grad(): + v = v.flatten() + if padding_size > 0: + v = torch.nn.functional.pad(v, [0, padding_size]) + slice_size = v.numel() // self.moe_dp_size + v = v.split(slice_size, dim=0)[self.moe_dp_rank] + + state_[k] = v.detach().clone().to(device) + + return state_ + + """Migration from MoEHybridParallelCheckpointIO. These functions mostly deals with unsharded saving, + and can be savely deleted since large MoE models are often saved in shards. + """ + + # Copied from colossalai.moe + def pre_save_model(self, model: nn.Module) -> dict: + state_dict = model.state_dict() + for name, param in model.named_parameters(): + if ".experts." 
in name and is_moe_tensor(param): + ep_group = param.ep_group + ep_rank = dist.get_rank(ep_group) + ep_size = dist.get_world_size(ep_group) + # TODO: check correctness here + # dp_rank = get_dp_rank(param) + dp_rank = dist.get_rank(self.global_dp_group) + if dp_rank == 0: + param = param.data.cuda() + if ep_rank == 0: + all_param = [torch.zeros_like(param) for _ in range(ep_size)] + else: + all_param = None + # gather param from every ep rank + # dist.all_gather(all_param, param, group=ep_group) + dist.gather(param, all_param, group=ep_group) + if ep_rank == 0: + all_param = torch.cat(all_param, dim=0) + state_dict[name] = all_param.cpu() + + if self.pp_size > 1: + if self.dp_rank == 0: + out = [None for _ in range(self.pp_size)] + dist.gather_object(state_dict, out, group=self.pp_group) + if self.pp_rank == 0: + new_state_dict = {} + for o in out: + new_state_dict.update(o) + state_dict = new_state_dict + dist.barrier() + return state_dict + + def save_unsharded_model( + self, + model: nn.Module, + checkpoint: str, + gather_dtensor: bool, + use_safetensors: bool, + ): + state_dict = self.pre_save_model(model) + if dist.get_rank() == 0: + torch.save(state_dict, checkpoint) + dist.barrier() + + # Copied from colossalai.moe def save_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, gather_dtensor: bool): """ Save optimizer state dict to a file with given path. @@ -781,7 +766,8 @@ def save_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, # When pipeline is used, first collect state_dict from every pipeline stage, then save the complete state_dict. states_list = [None for _ in range(self.pp_size)] dist.barrier(self.pp_group) - dist.all_gather_object(states_list, local_states, self.pp_group) + # dist.all_gather_object(states_list, local_states, self.pp_group) + dist.gather_object(local_states, states_list, self.pp_group) # Only the master rank do the saving. if self.coordinator.is_master(): @@ -790,3 +776,85 @@ def save_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, state_dict["state"].update(_states) save_state_dict(state_dict, checkpoint, use_safetensors=False) dist.barrier() + + # Copied from colossalai.moe + def load_unsharded_optimizer(self, optimizer: OptimizerWrapper, checkpoint: str, strict: bool = False): + """ + Load optimizer from a file with given path. + + Args: + optimizer (OptimizerWrapper): The optimizer to be loaded. + checkpoint_index_file (str): Path to the checkpoint file. + """ + + def _get_param_id_from_optimizer_param( + param: torch.Tensor, master_to_working_map: Optional[Dict[int, torch.Tensor]] = None + ): + if master_to_working_map is not None and id(param) in master_to_working_map: + working_param = master_to_working_map[id(param)] + else: + working_param = param + if id(working_param) in optimizer.param_info["param2id"]: + return optimizer.param_info["param2id"][id(working_param)] + else: + None + + if self.coordinator.is_master(): + logging.warning("Please avoid using unsharded checkpointing methods when dealing with large models!") + + assert isinstance(optimizer, OptimizerWrapper), "Please boost the optimizer before loading!" + + # Complete optimizer state_dict loaded from checkpoint, need to be processed later. + state_dict = load_state_dict(checkpoint) + + # Load param_groups. 
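# Illustrative sketch of the expert gather performed by pre_save_model above:
# a parameter tagged with an `ep_group` holds only its local slice of the
# stacked expert weights, so EP rank 0 gathers the slices and concatenates
# them along dim 0 before the state dict is written. The function name and the
# single-process demo are assumptions; dst=0 is a simplification of mapping
# the group's rank 0 to its global rank.
import os
import torch
import torch.distributed as dist

def gather_expert_param(param: torch.Tensor, ep_group) -> torch.Tensor:
    ep_rank = dist.get_rank(ep_group)
    ep_size = dist.get_world_size(ep_group)
    gathered = [torch.zeros_like(param) for _ in range(ep_size)] if ep_rank == 0 else None
    dist.gather(param, gathered, dst=0, group=ep_group)
    # only EP rank 0 ends up with the full (num_experts, ...) tensor
    return torch.cat(gathered, dim=0) if ep_rank == 0 else param

if __name__ == "__main__":  # world-size-1 demo: the gather is a no-op
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29502")
    dist.init_process_group("gloo", rank=0, world_size=1)
    w = torch.randn(2, 32, 32)  # (local experts, hidden, intermediate)
    assert gather_expert_param(w, dist.group.WORLD).shape == w.shape
    dist.destroy_process_group()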
+ updated_groups = [] + saved_groups = state_dict["param_groups"] + for old_pg, saved_pg in zip(optimizer.optim.param_groups, saved_groups): + new_pg = copy.deepcopy(saved_pg) + new_pg["params"] = old_pg["params"] # Only keep the parameters kept by current pipeline stage. + updated_groups.append(new_pg) + + # ep extra group + # if MOE_MANAGER.parallel == "EP": + if self.ep_size > 1: + new_pg = copy.deepcopy(saved_pg) + new_pg["params"] = optimizer.optim.param_groups[-1][ + "params" + ] # Only keep the parameters kept by current pipeline stage. + for param in new_pg["params"]: + param.data = param.data.to(torch.float32) + updated_groups.append(new_pg) + optimizer.optim.__dict__.update({"param_groups": updated_groups}) + + # Load saved states to optimizer. First discard those states not belonging to current pipeline stage. + master_to_working_map = optimizer.get_master_to_working_map() + id_map = {} + for pg in optimizer.optim.param_groups: + for param in pg["params"]: + param_id = _get_param_id_from_optimizer_param(param, master_to_working_map) + if param_id is not None: + id_map[param_id] = param + load_states_into_optimizer(optimizer.optim, state_dict["state"], id_map, strict=True) + + # Then shard the loaded optimizer states if using tp/zero. + for param, state in optimizer.optim.state.items(): + if param is None: + continue + device = param.device + if master_to_working_map is not None and id(param) in master_to_working_map: + working_param = master_to_working_map[id(param)] + else: + working_param = param + original_shape = optimizer.param_info["param2shape"][id(working_param)] + sharded_state = self.pre_load_optim( + state, + param, + current_shape=working_param.shape, + original_shape=original_shape, + device=device, + inplace=True, + ) + optimizer.optim.state[param] = sharded_state + sharded_optimizer_loading_epilogue(optimizer.optim) + dist.barrier() diff --git a/colossalai/moe/load_balance.py b/colossalai/moe/load_balance.py new file mode 100644 index 000000000000..85c12d73fa52 --- /dev/null +++ b/colossalai/moe/load_balance.py @@ -0,0 +1,442 @@ +from copy import deepcopy +from typing import List, Optional, Tuple + +import torch +import torch.distributed as dist +from torch import Tensor, nn +from torch.distributed import ProcessGroup + +from colossalai.cluster import ProcessGroupMesh +from colossalai.moe.experts import MLPExperts +from colossalai.moe.manager import MOE_MANAGER +from colossalai.zero.low_level import LowLevelZeroOptimizer + + +class LoadBalancer: + def __init__( + self, + experts: MLPExperts, + gate: nn.Parameter, + local_expert_num: int, + expert_num: int, + ep_group: ProcessGroup, + dp_group: ProcessGroup, + tolerance: Optional[float] = 0.1, + beam_width: Optional[int] = 8, + group_swap_factor: Optional[float] = 0.4, + ) -> None: + self.experts: MLPExperts = experts + self.gate: nn.Parameter = gate + self.moe_ep_group: ProcessGroup = ep_group + self.moe_ep_ranks = MOE_MANAGER.parallel_info_dict[dist.get_world_size(self.moe_ep_group)].ep_group_ranks + self.moe_dp_group: ProcessGroup = dp_group + self.tolerance = tolerance + self.beam_width = beam_width + self.group_swap_factor = group_swap_factor + self.local_expert_num = local_expert_num + self.expert_num = expert_num + self.local_load = None + # TODO: use a global process group mesh + pp_size = 1 if MOE_MANAGER.pp_size is None else MOE_MANAGER.pp_size + global_dp_group = ProcessGroupMesh(pp_size, dist.get_world_size() // pp_size) + self.global_dp_group = global_dp_group.get_group_along_axis(1) + 
self.global_dp_rank = dist.get_rank(self.global_dp_group) + self.global_dp_size = dist.get_world_size(self.global_dp_group) + + def _clear_load(self) -> None: + self.local_load = None + + def _sync_load(self) -> Tensor: + new_load = self.local_load.clone().detach() + # all reduce load between ep group + dist.all_reduce(new_load, group=self.moe_ep_group) + # all reduce load between dp group + dist.all_reduce(new_load, group=self.moe_dp_group) + return new_load + + @staticmethod + def _get_diff_from_avg(data: List, group: int, avg: float) -> float: + return abs(sum(data[group]) / len(data[group]) - avg) + + @staticmethod + def _swap_data(data: List, group_i: int, index_i: int, group_j: int, index_j: int) -> None: + data[group_i][index_i], data[group_j][index_j] = ( + data[group_j][index_j], + data[group_i][index_i], + ) + + @staticmethod + def _normalize_data(data: List) -> List: + max_value = max(max(sublist) for sublist in data) + data = [[i / max_value for i in sublist] for sublist in data] + return data + + @staticmethod + def _get_swap_loss( + group_swap_factor: float, + swap_list: List, + group_i: int, + index_i: int, + group_j: int, + index_j: int, + ) -> float: + """ + Get swap loss. The swap loss is used to avoid the situation that + the same index is swapped twice and the same group is swapped for multiple times. + """ + swap_loss = 0 + for swap in swap_list: + for group_id, index_id in zip([group_i, group_j], [index_i, index_j]): + # the group has been swapped + if group_id in [swap[0], swap[2]]: + # the index has been swapped + # we want to avoid the situation that the same index is swapped twice + if index_id in [swap[1], swap[3]]: + swap_loss += 1e5 + # the index has not been swapped + # this is acceptable but as less as possible + else: + swap_loss += group_swap_factor + return swap_loss + + @staticmethod + def _check_convergence(data: List, avg: float, tolerance: float): + """ + Check whether the data is converged after swap. + """ + for sublist in data: + if abs(sum(sublist) / len(sublist) - avg) > tolerance * avg: + return False + return True + + def _beam_search( + self, + inputs: Tuple[List, float, List], + beam_width: int, + avg: float, + group_swap_factor: float, + ) -> List: + """ + Beam search for the best swap combination. + Specifically, we swap two elements from two groups and calculate the score. + The score is the difference between the origin group sum and the new group sum. + The larger the score, the better the swap combination. 
+ + Args: + inputs (Tuple): (data, origin_score, swap_list) + beam_width (int): beam width for beam search + avg (float): average value of the data + group_swap_factor (float): group loss for group swap loss + + Returns: + List: results list + """ + data, origin_score, swap_list = inputs + results = [] + group_num = len(data) + group_size = len(data[0]) + origin_diff_list = [self._get_diff_from_avg(data, i, avg) for i in range(group_num)] + + for group_num_i in range(group_num): + for group_size_i in range(group_size): + for group_num_j in range(group_num_i + 1, group_num): + for group_size_j in range(group_size): + new_data = deepcopy(data) + # calculate origin group sum + origin_diff = origin_diff_list[group_num_i] + origin_diff_list[group_num_j] + # swap data + self._swap_data( + new_data, + group_num_i, + group_size_i, + group_num_j, + group_size_j, + ) + # calculate new group sum + new_diff = self._get_diff_from_avg(new_data, group_num_i, avg) + self._get_diff_from_avg( + new_data, group_num_j, avg + ) + # caculate score + new_score = origin_diff - new_diff + if new_score > 0: + new_score = origin_score + new_score + # get swap loss + swap_loss = self._get_swap_loss( + group_swap_factor, + swap_list, + group_num_i, + group_size_i, + group_num_j, + group_size_j, + ) + new_score = new_score - swap_loss + # update swap list + new_swap_list = swap_list + [(group_num_i, group_size_i, group_num_j, group_size_j)] + results.append((new_data, new_score, new_swap_list)) + # sort results + results.sort(key=lambda x: x[1], reverse=True) + # select top k results + results = results[:beam_width] + return results + + def _load_to_list(self, load: Tensor) -> List: + load_len = len(load) + assert load_len % self.local_expert_num == 0 + load_list = [] + tmp_list = [] + for i in range(len(load)): + tmp_list.append(float(load[i])) + if (i + 1) % self.local_expert_num == 0: + load_list.append(tmp_list) + tmp_list = [] + return load_list + + def _search_balance( + self, + data: List, + tolerance: Optional[float] = 0.1, + beam_width: Optional[int] = 8, + group_swap_factor: Optional[float] = 0.4, + return_swapped_data: Optional[bool] = False, + ) -> Tuple[List, List]: + """ + Search for the best swap combination to balance the data within the specified tolerance. + And return the balanced data and the swap list. The swap list is used to record the swap. + The swap list is a list of tuples. Each tuple is a swap operation. + + Args: + data (List): expert load list. + E.g. [[9.2, 8.3], [2.3, 10.0], [6.1, 7.2], [5.3, 3.2]] + This means there are 4 devices and each devices has 2 experts. + The value is the load of the expert. + tolerance (float): tolerance for balance. + beam_width (int): beam width for beam search. + group_swap_factor (float): group swap factor for group swap loss. + The bigger it is, the less times a group will be swapped. + return_swapped_data (bool): whether to return the swapped data. + + Returns: + Tuple: (balanced data, swap list). + The swap list is a list of tuples. Each tuple is a swap operation. + E.g. [(0, 0, 1, 0), (...), (...)]. The first tuple means + the first expert of the first device is swapped with the first expert + of the second device. 
+ """ + norm_data = self._normalize_data(data) + avg = sum(sum(sublist) / len(sublist) for sublist in norm_data) / len(norm_data) + results = [(norm_data, 0, [])] + stop_flag = False + + while stop_flag == False: + new_results = [] + best_score = results[0][1] + for i in range(len(results)): + new_results.extend(self._beam_search(results[i], beam_width, avg, group_swap_factor)) + if len(new_results) == 0: + stop_flag = True + break + new_results.sort(key=lambda x: x[1], reverse=True) + new_best_score = new_results[0][1] + if new_best_score == best_score: + stop_flag = True + break + new_results = new_results[:beam_width] + results = new_results + for i in results: + if self._check_convergence(results[0][0], avg, tolerance): + stop_flag = True + break + + swap_list = results[0][2] + if return_swapped_data: + out = deepcopy(data) + for swap in swap_list: + self._swap_data(out, *swap) + return out, swap_list + else: + return swap_list + + @staticmethod + def _swap_expert_single_tensor( + weight: nn.Parameter, + expert_idx: int, + comm_group: ProcessGroup, + send_first: bool, + comm_rank: int, + ): + # exchange weight + local_weight = weight.data[expert_idx] + new_weight = torch.empty_like(local_weight) + if send_first: + dist.send(local_weight, dst=comm_rank, group=comm_group) + dist.recv(new_weight, src=comm_rank, group=comm_group) + else: + dist.recv(new_weight, src=comm_rank, group=comm_group) + dist.send(local_weight, dst=comm_rank, group=comm_group) + weight.data[expert_idx] = new_weight + + def _swap_expert_param_and_optim( + self, + weight: nn.Parameter, + expert_idx: int, + comm_group: ProcessGroup, + send_first: bool, + comm_rank: int, + optim: LowLevelZeroOptimizer, + ): + # need to update master and working param if master param exists + # else just update working param + if weight in optim.optim.state: + master_weight_ptr = None + working_weight_ptr = weight + exp_avg_ptr = optim.optim.state[working_weight_ptr]["exp_avg"] + exp_avg_sq_ptr = optim.optim.state[working_weight_ptr]["exp_avg_sq"] + else: + master_weight_ptr = optim._param_store.working_to_master_param[id(weight)] + working_weight_ptr = weight + exp_avg_ptr = optim.optim.state[master_weight_ptr]["exp_avg"] + exp_avg_sq_ptr = optim.optim.state[master_weight_ptr]["exp_avg_sq"] + + # exchange weight + self._swap_expert_single_tensor( + working_weight_ptr, + expert_idx, + comm_group, + send_first, + comm_rank, + ) + if master_weight_ptr is not None: + # TODO: exchange master weight, skip for now + # master weight is shared by dp group + tmp = working_weight_ptr.view(-1).split( + working_weight_ptr.numel() // dist.get_world_size(self.moe_dp_group) + )[dist.get_rank(self.moe_dp_group)] + master_weight_ptr.data.copy_(tmp.clone().detach().to(master_weight_ptr.device).to(master_weight_ptr.dtype)) + # exchange optim + self._swap_expert_single_tensor(exp_avg_ptr, expert_idx, comm_group, send_first, comm_rank) + self._swap_expert_single_tensor(exp_avg_sq_ptr, expert_idx, comm_group, send_first, comm_rank) + + def _gather_global_dp_group(self, data: Tensor) -> Tensor: + data_list = [torch.zeros_like(data) for _ in range(self.global_dp_size)] + dist.all_gather(data_list, data, group=self.global_dp_group) + data_list = torch.cat(data_list, dim=0) + return data_list + + def _swap_moe_param(self, swap_list: List, optim: LowLevelZeroOptimizer) -> None: + """ + Swap moe param and optim. + We use different strategies to swap expert and gate. + For expert, we exchange the param and optim of the expert by p2p. 
+ For gate, we all gather the gate choose the part we want. + + Args: + swap_list (List) + optim (LowLevelZeroOptimizer) + """ + # get all experts weights + local_rank = dist.get_rank(self.moe_ep_group) + if self.experts.gated: + weight_list = [self.experts.wi_up, self.experts.wi_gate] + else: + weight_list = [self.experts.wi] + weight_list.append(self.experts.wo) + + # gate optim should be obtained first + gate_shape = self.gate.shape + # get master weight and optim + master_gate_weight = optim._param_store.working_to_master_param[id(self.gate)] + gate_exp_avg = optim.optim.state[master_gate_weight]["exp_avg"] + gate_exp_avg_sq = optim.optim.state[master_gate_weight]["exp_avg_sq"] + # gather + global_master_gate_weight = self._gather_global_dp_group(master_gate_weight).view(gate_shape) + global_gate_exp_avg = self._gather_global_dp_group(gate_exp_avg).view(gate_shape) + global_gate_exp_avg_sq = self._gather_global_dp_group(gate_exp_avg_sq).view(gate_shape) + assert ( + self.gate.shape + == global_master_gate_weight.shape + == global_gate_exp_avg.shape + == global_gate_exp_avg_sq.shape + ) + + for swap in swap_list: + source_group, source_idx, target_group, target_idx = swap + source_rank = self.moe_ep_ranks[source_group] + target_rank = self.moe_ep_ranks[target_group] + # exchange expert + if local_rank in [source_group, target_group]: + for weight in weight_list: + if local_rank == source_group: + self._swap_expert_param_and_optim( + weight, + source_idx, + self.moe_ep_group, + True, + target_rank, + optim, + ) + elif local_rank == target_group: + self._swap_expert_param_and_optim( + weight, + target_idx, + self.moe_ep_group, + False, + source_rank, + optim, + ) + # exchange gate + source_expert_pos = source_group * self.local_expert_num + source_idx + target_expert_pos = target_group * self.local_expert_num + target_idx + for gate in [ + self.gate, + global_master_gate_weight, + global_gate_exp_avg, + global_gate_exp_avg_sq, + ]: + origin_source = gate.data[source_expert_pos].clone().detach() + origin_target = gate.data[target_expert_pos].clone().detach() + gate.data[source_expert_pos], gate.data[target_expert_pos] = ( + origin_target, + origin_source, + ) + + # update gate + global_master_gate_weight = global_master_gate_weight.view(-1).split( + global_master_gate_weight.numel() // self.global_dp_size + )[self.global_dp_rank] + master_gate_weight.data.copy_(global_master_gate_weight) + global_gate_exp_avg = global_gate_exp_avg.view(-1).split(global_gate_exp_avg.numel() // self.global_dp_size)[ + self.global_dp_rank + ] + gate_exp_avg.data.copy_(global_gate_exp_avg) + global_gate_exp_avg_sq = global_gate_exp_avg_sq.view(-1).split( + global_gate_exp_avg_sq.numel() // self.global_dp_size + )[self.global_dp_rank] + gate_exp_avg_sq.data.copy_(global_gate_exp_avg_sq) + + @torch.no_grad() + def update_load(self, load: Tensor) -> None: + if len(load) != self.expert_num: + padding_size = self.expert_num - len(load) + padding = torch.zeros(padding_size, dtype=load.dtype, device=load.device) + load = torch.cat((load, padding), dim=0) + if self.local_load is None: + self.local_load = load + else: + self.local_load += load + + @torch.no_grad() + def balance_load(self, optim: LowLevelZeroOptimizer) -> None: + # prepare load + load = self._sync_load() + load = self._load_to_list(load) + # search balance + swap_list = self._search_balance(load) + if dist.get_rank() == 0: + if len(swap_list) > 0: + print(f"[Load Balance] Applying expert swap...") + else: + print(f"[Load Balance] Invalid swap, skip...") 
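# A small pure-Python sketch of the structures balance_load works with above:
# the per-EP-rank expert load list (see the _search_balance docstring example)
# and the swap tuples it returns. A swap (group_i, idx_i, group_j, idx_j)
# exchanges expert idx_i on EP rank group_i with expert idx_j on rank group_j.
# The helper name is an assumption for illustration only.
from typing import List, Tuple

def apply_swaps(load: List[List[float]], swaps: List[Tuple[int, int, int, int]]) -> List[List[float]]:
    out = [row[:] for row in load]
    for gi, ii, gj, ij in swaps:
        out[gi][ii], out[gj][ij] = out[gj][ij], out[gi][ii]
    return out

# 4 EP ranks x 2 local experts; moving the hot expert off rank 1 onto rank 3
# evens out the per-rank totals.
load = [[9.2, 8.3], [2.3, 10.0], [6.1, 7.2], [5.3, 3.2]]
balanced = apply_swaps(load, [(1, 1, 3, 1)])
assert balanced[1] == [2.3, 3.2] and balanced[3] == [5.3, 10.0]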
+ # swap expert and gate + self._swap_moe_param(swap_list, optim) + # clear load + self._clear_load() diff --git a/colossalai/moe/utils.py b/colossalai/moe/utils.py index c642f1a4450f..3d08ab7dd9b0 100644 --- a/colossalai/moe/utils.py +++ b/colossalai/moe/utils.py @@ -6,10 +6,11 @@ import torch.distributed as dist import torch.nn as nn import torch.nn.functional as F +from torch.distributed.distributed_c10d import get_process_group_ranks from colossalai.accelerator import get_accelerator from colossalai.moe.manager import MOE_MANAGER -from colossalai.tensor.moe_tensor.api import get_dp_group, get_dp_group_ranks, get_ep_size, is_moe_tensor +from colossalai.tensor.moe_tensor.api import is_moe_tensor class ForceFP32Parameter(torch.nn.Parameter): @@ -145,7 +146,7 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]] if not is_moe_tensor(param): ep_size = 1 # set ep_size to 1 for dp parameters else: - ep_size = get_ep_size(param) + ep_size = dist.get_world_size(param.ep_group) if ep_size not in epsize_param_dict: epsize_param_dict[ep_size] = [] epsize_param_dict[ep_size].append(param) @@ -170,8 +171,8 @@ def sync_moe_model_param(model: nn.Module): # When ep_size = world_size, communication is not needed if ep_size != 1 and ep_size != MOE_MANAGER.world_size: for param in param_dict[ep_size]: - src_rank = get_dp_group_ranks(param)[0] - dist.broadcast(param, src=src_rank, group=get_dp_group(param)) + src_rank = get_process_group_ranks(param.dp_group)[0] + dist.broadcast(param, src=src_rank, group=param.dp_group) def set_moe_args(config: Any, args: dict): diff --git a/colossalai/shardformer/layer/moe/__init__.py b/colossalai/shardformer/layer/moe/__init__.py new file mode 100644 index 000000000000..6fa015a94ca2 --- /dev/null +++ b/colossalai/shardformer/layer/moe/__init__.py @@ -0,0 +1,3 @@ +from .experts import * +from .layers import * +from .routers import * diff --git a/colossalai/shardformer/layer/moe/experts.py b/colossalai/shardformer/layer/moe/experts.py new file mode 100644 index 000000000000..373315fb933c --- /dev/null +++ b/colossalai/shardformer/layer/moe/experts.py @@ -0,0 +1,161 @@ +import math +from typing import Callable, Optional, Tuple + +import torch +import torch.nn as nn + +from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON +from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler +from colossalai.moe.manager import MOE_MANAGER +from colossalai.moe.utils import get_activation +from colossalai.shardformer.layer.utils import Randomizer +from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size, set_moe_tensor_info + +if HAS_TRITON: + from colossalai.kernel.triton.llama_act_combine_kernel import LlamaActCombine + + +class MLPExperts(nn.Module): + """ + SparseMLP is a multi-layer perceptron with sparse expert parallel layers. + + Args: + num_experts (int): The number of experts + hidden_size (int): The hidden size of MLP + intermediate_size (int): The intermediate size of MLP + expert_parallel (str, optional): The parallelism of experts. Now we have None, EP and TP. 
+ activation (optional): The activation function of MLP + drop_rate (float, optional): The drop rate of MLP + gated (bool, optional): Whether to use gated MLP + use_kernel (bool, optional): Whether to use kernel optimization + """ + + def __init__( + self, + num_experts: int, + hidden_size: int, + intermediate_size: int, + expert_parallel: Optional[str] = "EP", + activation: Optional[Callable] = None, + drop_rate: Optional[float] = 0, + gated: Optional[bool] = False, + use_kernel: Optional[bool] = False, + ): + super().__init__() + assert expert_parallel in ["EP", "TP", None] + self.expert_parallel = expert_parallel + self.num_total_experts = num_experts + self.gated = gated + self.use_kernel = use_kernel + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + + # get expert parallel info + if expert_parallel is not None: + self.num_local_experts, self.moe_info = MOE_MANAGER.get_info( + num_experts, use_tp=True if expert_parallel == "TP" else False + ) + # get settings for different parallel + self.ep_size = get_ep_size(self) + if expert_parallel == "TP": + intermediate_size = intermediate_size // self.ep_size + num_experts = self.num_total_experts + else: + num_experts = self.num_local_experts + else: + self.num_local_experts = self.num_total_experts + self.ep_size = 1 + + if gated: + self.wi_gate = nn.Parameter( + torch.empty( + num_experts, hidden_size, intermediate_size * 2 if activation == "swiglu" else intermediate_size + ) + ) + self.wi_up = nn.Parameter(torch.empty(num_experts, hidden_size, intermediate_size)) + else: + self.wi = nn.Parameter(torch.empty(num_experts, hidden_size, intermediate_size)) + self.wo = nn.Parameter(torch.empty(num_experts, intermediate_size, hidden_size)) + + self.act_name = activation + self.act = get_activation(activation) + self.drop = nn.Dropout(p=drop_rate) + + if expert_parallel is not None: + for param in self.parameters(): + set_moe_tensor_info(param, self.moe_info) + + # init param + self.reset_parameters() + + @torch.no_grad() + def reset_parameters(self): + # expert param should be different + if self.expert_parallel is not None: + seed_ctx = Randomizer(get_ep_rank(self)).fork_rng(enable_cpu=True) + else: + seed_ctx = Randomizer(42).fork_rng(enable_cpu=True) + with seed_ctx: + if self.gated: + torch.nn.init.normal_(self.wi_gate, std=math.sqrt(0.1 / self.hidden_size)) + torch.nn.init.normal_(self.wi_up, std=math.sqrt(0.1 / self.hidden_size)) + else: + torch.nn.init.normal_(self.wi, std=math.sqrt(0.1 / self.hidden_size)) + torch.nn.init.normal_(self.wo, std=math.sqrt(0.1 / self.intermediate_size)) + + def forward( + self, + x: torch.Tensor, + param_slice: Tuple[slice] = (slice(None),), + use_sparse: bool = True, + ) -> torch.Tensor: + """ + forward: hidden_size --> intermediate_size --> hidden_size + + Args: + x (torch.Tensor): The input tensor of shape (num_groups, num_experts, capacity, hidden_size) + + Returns: + torch.Tensor: The output tensor of shape (num_groups, num_experts, capacity, hidden_size) + """ + x = MoeInGradScaler.apply(x, self.ep_size) + + e = x.size(1) + h = x.size(-1) + + x = x.transpose(0, 1) + inshape = x.shape + x = x.reshape(e, -1, h) + + if self.use_kernel and use_sparse: + seq_len = x.shape[1] + with torch.no_grad(): + mask = x[:, :, 0] != 0.0 + mask = torch.sum(mask, dim=-1) + x_list = [] + for i in range(e): + x_list.append(x[i, : mask[i]]) + x = x_list + + if self.gated: + x_gate = [torch.mm(x[i], self.wi_gate[param_slice][i]) for i in range(e)] + x_up = [torch.mm(x[i], 
self.wi_up[param_slice][i]) for i in range(e)] + if self.use_kernel and HAS_TRITON and self.act_name == "swiglu": + x = [LlamaActCombine.apply(x_gate[i], x_up[i]) for i in range(e)] + else: + x = [self.act(x_gate[i]) * x_up[i] for i in range(e)] + else: + x = [torch.mm(x[i], self.wi[param_slice][i]) for i in range(e)] + x = [self.act(x[i]) for i in range(e)] + x = [self.drop(x[i]) for i in range(e)] + x = [torch.mm(x[i], self.wo[param_slice][i]) for i in range(e)] + + if self.use_kernel and use_sparse: + for i in range(e): + x[i] = torch.nn.functional.pad(x[i], (0, 0, 0, seq_len - x[i].shape[0]), mode="constant", value=0) + + x = torch.cat([x[i].unsqueeze(0) for i in range(e)], dim=0) + x = x.reshape(inshape) + x = x.transpose(0, 1).contiguous() + x = MoeOutGradScaler.apply(x, self.ep_size) + return x diff --git a/colossalai/shardformer/layer/moe/layers.py b/colossalai/shardformer/layer/moe/layers.py new file mode 100644 index 000000000000..e1f7a240d0e3 --- /dev/null +++ b/colossalai/shardformer/layer/moe/layers.py @@ -0,0 +1,404 @@ +import dataclasses +import math +from typing import Any, Optional, Tuple + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F + +from colossalai.moe._operation import AllGather, AllToAll, HierarchicalAllToAll, MoeCombine, MoeDispatch, ReduceScatter +from colossalai.moe.load_balance import LoadBalancer +from colossalai.moe.utils import create_ep_hierarchical_group, get_noise_generator +from colossalai.shardformer.layer.moe import MLPExperts +from colossalai.shardformer.layer.moe.routers import MoeRouter, get_router_cls +from colossalai.tensor.moe_tensor.api import get_dp_group, get_ep_group, get_ep_group_ranks, get_ep_size + + +class SparseMLP(nn.Module): + """A class for users to create MoE modules in their models. + + Args: + dim_model (int): Hidden dimension of training model + num_experts (int): The number experts + top_k (int, optional): The number of experts for dispatchment of each token + parallel (str): parallel mode. Should be "EP", "TP" or None + capacity_factor_train (float, optional): Capacity factor in routing during training + capacity_factor_eval (float, optional): Capacity factor in routing during evaluation + min_capacity (int, optional): The minimum number of the capacity of each expert + noisy_policy (str, optional): The policy of noisy function. Now we have 'Jitter' and 'Gaussian'. + 'Jitter' can be found in `Switch Transformer paper`_. + 'Gaussian' can be found in `ViT-MoE paper`_. + drop_tks (bool, optional): Whether drops tokens in evaluation + use_residual (bool, optional): Makes this MoE layer a Residual MoE. + More information can be found in `Microsoft paper`_. + residual_instance (nn.Module, optional): The instance of residual module in Residual MoE + expert_instance (MoeExperts, optional): The instance of experts module in MoeLayer + expert_cls (Type[nn.Module], optional): The class of each expert when no instance is given + expert_args (optional): The args of expert when no instance is given + + .. _Switch Transformer paper: + https://arxiv.org/abs/2101.03961 + .. _ViT-MoE paper: + https://arxiv.org/abs/2106.05974 + .. 
_Microsoft paper: + https://arxiv.org/abs/2201.05596 + """ + + def __init__( + self, + num_experts: int, + hidden_size: int, + intermediate_size: int, + router_top_k: int = 1, + parallel: str = "EP", + router_loss: bool = True, + router_norm: bool = False, + router_capacity_factor_train: float = 1.25, + router_capacity_factor_eval: float = 2.0, + router_min_capacity: int = 4, + router_noisy_policy: Optional[str] = None, + router_drop_tks: bool = True, + mlp_activation: Optional[str] = None, + mlp_gated: bool = False, + enable_load_balance: bool = False, + load_balance_tolerance: float = 0.1, + load_balance_beam_width: int = 8, + load_balance_group_swap_factor: float = 0.4, + enable_kernel: bool = False, + enable_comm_overlap: bool = False, + enable_hierarchical_comm: bool = True, + return_gate_logits: bool = False, + ): + super().__init__() + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_experts = num_experts + self.gated = mlp_gated + self.return_gate_logits = return_gate_logits + self.enable_kernel = enable_kernel + self.enable_comm_overlap = enable_comm_overlap + # self.expert_parallel = MOE_MANAGER.get_parallel() + assert parallel in ["EP", "TP", None], "parallel mode must be EP, TP or None" + self.parallel = parallel + self.router_loss = router_loss + self.router_norm = router_norm + + # moe router + noisy_func = get_noise_generator(router_noisy_policy, num_experts) + router_cls = get_router_cls(router_top_k) + self.topk = router_top_k + self.router: MoeRouter = router_cls( + capacity_factor_train=router_capacity_factor_train, + capacity_factor_eval=router_capacity_factor_eval, + min_capacity=router_min_capacity, + noisy_func=noisy_func, + drop_tks=router_drop_tks, + ) + + # gate + self.gate_weight = torch.nn.Parameter(torch.empty(num_experts, self.hidden_size)) + + # moe experts + self.experts = MLPExperts( + num_experts=self.num_experts, + expert_parallel=self.parallel, + hidden_size=self.hidden_size, + intermediate_size=self.intermediate_size, + activation=mlp_activation, + gated=mlp_gated, + use_kernel=self.enable_kernel, + ) + + # get parallel settings + if self.parallel is not None: + self.ep_group = get_ep_group(self.experts) + self.ep_size = get_ep_size(self.experts) + self.ep_hierarchical_group = None + if enable_hierarchical_comm: + # TODO: move to plugin + self.ep_intra_src_rank, *self.ep_hierarchical_group = create_ep_hierarchical_group( + get_ep_group_ranks(self.experts) + ) + self.dp_group = get_dp_group(self.experts) + else: + self.ep_group = None + self.dp_group = None + self.num_local_experts = self.experts.num_local_experts + + # load balance + self.enable_load_balance = enable_load_balance + if self.enable_load_balance == True: + self.load_balancer = LoadBalancer( + experts=self.experts, + gate=self.gate_weight, + local_expert_num=self.num_local_experts, + expert_num=self.num_experts, + ep_group=self.ep_group, + dp_group=self.dp_group, + tolerance=load_balance_tolerance, + beam_width=load_balance_beam_width, + group_swap_factor=load_balance_group_swap_factor, + ) + + # init param + self.reset_parameters() + + @torch.no_grad() + def reset_parameters(self): + torch.nn.init.normal_(self.gate_weight, std=math.sqrt(0.1 / self.hidden_size)) + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + """ + Args: + inputs (torch.Tensor): The input tensor of shape (batch_size, seq_len, hidden_size) + + Returns: + torch.Tensor: The output tensor of shape (batch_size, seq_len, hidden_size) + """ + # reshape the input tokens + tokens 
= inputs.reshape(-1, self.hidden_size) + + # the data type of the inputs in the gating should be fp32 + gate_logits = F.linear(tokens, self.gate_weight) + gate_output = gate_logits.to(torch.float) + + # update expert load + if self.enable_load_balance == True: + with torch.no_grad(): + # TODO: optimize computation + expert_load = torch.topk(gate_output, k=self.topk, dim=-1)[1] + # TODO: bincount introduces synchronize, fix it + expert_load = torch.bincount(expert_load.view(-1)) + self.load_balancer.update_load(expert_load) + + # the result from the router + used_capacity, *route_result_list = self.router( + inputs=gate_output, + use_kernel=self.enable_kernel, + ep_group=self.ep_group, + use_loss=self.router_loss, + use_norm=self.router_norm, + ) + + # dispatch_data: (num_experts, capacity, hidden_size) + if self.enable_kernel: + dispatch_data = MoeDispatch.apply(tokens, *route_result_list[1:]) + dispatch_data = dispatch_data.reshape(self.num_experts, -1, self.hidden_size) + else: + sec_mask_f = route_result_list[1].type_as(inputs) + dispatch_data = torch.matmul(sec_mask_f.permute(1, 2, 0), tokens) + + # expert_output: (num_groups, num_experts, capacity, hidden_size) + if self.parallel == "EP": + expert_output = self._ep_process(dispatch_data, used_capacity, overlap=self.enable_comm_overlap) + elif self.parallel == "TP": + expert_output = self._tp_process(dispatch_data, used_capacity, overlap=self.enable_comm_overlap) + elif self.parallel is None: + expert_output = self._local_process(dispatch_data) + else: + raise NotImplementedError( + "This kind of communication has not been implemented yet.\n" "Please use Experts build function." + ) + + if self.enable_kernel: + expert_output = expert_output.reshape(-1, self.hidden_size) + ans = MoeCombine.apply(expert_output, *route_result_list) + else: + combine_weights = route_result_list[0].type_as(inputs) + combine_weights = combine_weights.view(combine_weights.shape[0], -1) + expert_output = expert_output.view(-1, expert_output.shape[-1]) + ans = torch.matmul(combine_weights, expert_output) + + ans = ans.reshape(inputs.shape) + + if self.return_gate_logits: + return ans, gate_logits + else: + return ans + + def _local_process(self, expert_in: torch.Tensor) -> torch.Tensor: + expert_in = expert_in.unsqueeze(0) + expert_out = self.experts(expert_in) + return expert_out + + def _ep_process( + self, dispatch_data: torch.Tensor, used_capacity: torch.Tensor, overlap: bool = False + ) -> torch.Tensor: + """ + Expert Parallel + + Args: + dispatch_data (torch.Tensor): (num_experts, capacity, hidden_size) + + Returns: + torch.Tensor: (num_experts, capacity, hidden_size) + """ + if not overlap or dist.get_world_size(self.ep_group) == 1: + if self.ep_hierarchical_group is not None: + expert_input = HierarchicalAllToAll.apply( + dispatch_data, self.ep_hierarchical_group, self.ep_intra_src_rank + ) + expert_input = expert_input.reshape(self.ep_size, self.num_local_experts, -1, self.hidden_size) + expert_output = self.experts(expert_input) + expert_output = HierarchicalAllToAll.apply( + expert_output, self.ep_hierarchical_group, self.ep_intra_src_rank + ) + return expert_output + else: + expert_input = AllToAll.apply(dispatch_data, self.ep_group, False)[0] + expert_input = expert_input.reshape(self.ep_size, self.num_local_experts, -1, self.hidden_size) + expert_output = self.experts(expert_input) + expert_output = AllToAll.apply(expert_output, self.ep_group, False)[0] + return expert_output + else: + + @dataclasses.dataclass + class Capsule: + data: torch.Tensor 
+ handle: Any = None + + NUM_CHUNK = 4 + NUM_STAGES = 4 + + assert dispatch_data.shape[1] % NUM_CHUNK == 0, "arbitrary chunk num is not supported yet" + chunk_size = dispatch_data.shape[1] // NUM_CHUNK + input_shape = (self.ep_size, self.num_local_experts, -1, self.hidden_size) + dispatch_data = dispatch_data.reshape(*input_shape) + chunk_data = torch.split(dispatch_data, chunk_size, dim=2) + output = torch.empty_like(dispatch_data) + + offset = 0 + _expert_in, expert_in, _expert_out, expert_out = None, None, None, None + + for i in range(NUM_CHUNK + NUM_STAGES - 1): + if expert_out is not None: + expert_out.handle.wait() + output[:, :, offset : offset + chunk_size, :] = expert_out.data + offset += chunk_size + expert_out = None + + # all2all last output + if _expert_out is not None: + expert_out = Capsule( + *AllToAll.apply(_expert_out.data, self.ep_group, True), + ) + _expert_out = None + + # all2all next input + if 0 <= i < NUM_CHUNK: + _expert_in = Capsule(*AllToAll.apply(chunk_data[i].contiguous(), self.ep_group, True)) + + # compute + if expert_in is not None: + expert_in.handle.wait() + _expert_out = Capsule(data=self.experts(expert_in.data), handle=None) + expert_in = None + + if _expert_in is not None: + expert_in = _expert_in + _expert_in = None + + return output + + def _tp_process( + self, dispatch_data: torch.Tensor, used_capacity: torch.Tensor, overlap: bool = False + ) -> torch.Tensor: + """ + without overlap: + | C | + | A | | R | + + with overlap: + | C1 || C2 || C3 || C4 | + | A1 || A2 | | R1 | A3 || R2 | A4 || R3 | | R4 | + + where C is computation, A is all gather, R is reduce scatter. + + Args: + dispatch_data (torch.Tensor): (num_experts, capacity, hidden_size) + + Returns: + torch.Tensor: (num_experts, capacity, hidden_size) + """ + if not overlap or dist.get_world_size(self.ep_group) == 1: + expert_in = AllGather.apply(dispatch_data, self.ep_group, False)[0] + expert_out = self.experts(expert_in) + expert_out = ReduceScatter.apply(expert_out, self.ep_group, False)[0] + return expert_out + else: + + @dataclasses.dataclass + class Capsule: + data: torch.Tensor + handle: Any + indices: Tuple + + NUM_CHUNK = 4 + NUM_STAGES = 4 + + assert ( + dispatch_data.shape[0] % NUM_CHUNK == 0 + ), "arbitrary chunk num is not supported yet, please use chunk num that can divide num_experts" + chunk_size = dispatch_data.shape[0] // NUM_CHUNK + chunk_data = torch.split(dispatch_data, chunk_size, dim=0) + output = torch.empty_like(dispatch_data) + + def get_chunk_slice(idx: int, chunk_size: int) -> Tuple[slice]: + return (slice(idx * chunk_size, (idx + 1) * chunk_size),) + + _expert_in, expert_in, _expert_out, expert_out = None, None, None, None + + for i in range(NUM_CHUNK + NUM_STAGES - 1): + if expert_out is not None: + expert_out.handle.wait() + output[expert_out.indices] = expert_out.data + expert_out = None + + # reduce scatter last output + if _expert_out is not None: + expert_out = Capsule( + *ReduceScatter.apply(_expert_out.data, self.ep_group, True), + indices=_expert_out.indices, + ) + _expert_out = None + + # all gather next input + if 0 <= i < NUM_CHUNK: + _expert_in = Capsule( + *AllGather.apply(chunk_data[i].contiguous(), self.ep_group, True), + indices=get_chunk_slice(i, chunk_size), + ) + + # compute + if expert_in is not None: + expert_in.handle.wait() + _expert_out = Capsule( + self.experts(expert_in.data, expert_in.indices), + handle=None, + indices=expert_in.indices, + ) + expert_in = None + + if _expert_in is not None: + expert_in = _expert_in + _expert_in = None + 
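# Illustrative sketch (a simplified assumption, not the patch's code) of the
# chunked overlap schedule used by _ep_process/_tp_process above: each chunk
# flows through comm-in -> compute -> comm-out while the next chunk's comm-in
# is already in flight. The real loops use four stages with async all-to-all /
# all-gather + reduce-scatter; this toy version uses three stages and plain
# tensor ops so it runs without a process group.
import torch

def chunked_pipeline(x: torch.Tensor, num_chunk: int = 4) -> torch.Tensor:
    chunks = list(torch.split(x, x.shape[0] // num_chunk, dim=0))
    out, in_flight, computed = [], None, None
    for i in range(num_chunk + 2):      # num_chunk + num_stages - 1 with 3 stages
        if computed is not None:
            out.append(computed)        # stands in for the comm that returns results
            computed = None
        if in_flight is not None:
            computed = in_flight * 2    # stands in for the expert MLP
            in_flight = None
        if i < num_chunk:
            in_flight = chunks[i]       # stands in for the async comm of the next chunk
    return torch.cat(out, dim=0)

assert torch.equal(chunked_pipeline(torch.arange(8.0)), torch.arange(8.0) * 2)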
+ return output + + +def apply_load_balance(model: nn.Module, optim: Any) -> None: + """ + apply load balance to every experts in the model + """ + + def _apply_recursive(module: nn.Module): + for _, sub_module in module.named_children(): + if isinstance(sub_module, SparseMLP): + if sub_module.enable_load_balance == True: + sub_module.load_balancer.balance_load(optim) + _apply_recursive(sub_module) + + torch.cuda.empty_cache() + _apply_recursive(model) + torch.cuda.empty_cache() diff --git a/colossalai/shardformer/layer/moe/routers.py b/colossalai/shardformer/layer/moe/routers.py new file mode 100644 index 000000000000..373315fb933c --- /dev/null +++ b/colossalai/shardformer/layer/moe/routers.py @@ -0,0 +1,161 @@ +import math +from typing import Callable, Optional, Tuple + +import torch +import torch.nn as nn + +from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON +from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler +from colossalai.moe.manager import MOE_MANAGER +from colossalai.moe.utils import get_activation +from colossalai.shardformer.layer.utils import Randomizer +from colossalai.tensor.moe_tensor.api import get_ep_rank, get_ep_size, set_moe_tensor_info + +if HAS_TRITON: + from colossalai.kernel.triton.llama_act_combine_kernel import LlamaActCombine + + +class MLPExperts(nn.Module): + """ + SparseMLP is a multi-layer perceptron with sparse expert parallel layers. + + Args: + num_experts (int): The number of experts + hidden_size (int): The hidden size of MLP + intermediate_size (int): The intermediate size of MLP + expert_parallel (str, optional): The parallelism of experts. Now we have None, EP and TP. + activation (optional): The activation function of MLP + drop_rate (float, optional): The drop rate of MLP + gated (bool, optional): Whether to use gated MLP + use_kernel (bool, optional): Whether to use kernel optimization + """ + + def __init__( + self, + num_experts: int, + hidden_size: int, + intermediate_size: int, + expert_parallel: Optional[str] = "EP", + activation: Optional[Callable] = None, + drop_rate: Optional[float] = 0, + gated: Optional[bool] = False, + use_kernel: Optional[bool] = False, + ): + super().__init__() + assert expert_parallel in ["EP", "TP", None] + self.expert_parallel = expert_parallel + self.num_total_experts = num_experts + self.gated = gated + self.use_kernel = use_kernel + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + + # get expert parallel info + if expert_parallel is not None: + self.num_local_experts, self.moe_info = MOE_MANAGER.get_info( + num_experts, use_tp=True if expert_parallel == "TP" else False + ) + # get settings for different parallel + self.ep_size = get_ep_size(self) + if expert_parallel == "TP": + intermediate_size = intermediate_size // self.ep_size + num_experts = self.num_total_experts + else: + num_experts = self.num_local_experts + else: + self.num_local_experts = self.num_total_experts + self.ep_size = 1 + + if gated: + self.wi_gate = nn.Parameter( + torch.empty( + num_experts, hidden_size, intermediate_size * 2 if activation == "swiglu" else intermediate_size + ) + ) + self.wi_up = nn.Parameter(torch.empty(num_experts, hidden_size, intermediate_size)) + else: + self.wi = nn.Parameter(torch.empty(num_experts, hidden_size, intermediate_size)) + self.wo = nn.Parameter(torch.empty(num_experts, intermediate_size, hidden_size)) + + self.act_name = activation + self.act = get_activation(activation) + self.drop = nn.Dropout(p=drop_rate) + + if expert_parallel is 
not None: + for param in self.parameters(): + set_moe_tensor_info(param, self.moe_info) + + # init param + self.reset_parameters() + + @torch.no_grad() + def reset_parameters(self): + # expert param should be different + if self.expert_parallel is not None: + seed_ctx = Randomizer(get_ep_rank(self)).fork_rng(enable_cpu=True) + else: + seed_ctx = Randomizer(42).fork_rng(enable_cpu=True) + with seed_ctx: + if self.gated: + torch.nn.init.normal_(self.wi_gate, std=math.sqrt(0.1 / self.hidden_size)) + torch.nn.init.normal_(self.wi_up, std=math.sqrt(0.1 / self.hidden_size)) + else: + torch.nn.init.normal_(self.wi, std=math.sqrt(0.1 / self.hidden_size)) + torch.nn.init.normal_(self.wo, std=math.sqrt(0.1 / self.intermediate_size)) + + def forward( + self, + x: torch.Tensor, + param_slice: Tuple[slice] = (slice(None),), + use_sparse: bool = True, + ) -> torch.Tensor: + """ + forward: hidden_size --> intermediate_size --> hidden_size + + Args: + x (torch.Tensor): The input tensor of shape (num_groups, num_experts, capacity, hidden_size) + + Returns: + torch.Tensor: The output tensor of shape (num_groups, num_experts, capacity, hidden_size) + """ + x = MoeInGradScaler.apply(x, self.ep_size) + + e = x.size(1) + h = x.size(-1) + + x = x.transpose(0, 1) + inshape = x.shape + x = x.reshape(e, -1, h) + + if self.use_kernel and use_sparse: + seq_len = x.shape[1] + with torch.no_grad(): + mask = x[:, :, 0] != 0.0 + mask = torch.sum(mask, dim=-1) + x_list = [] + for i in range(e): + x_list.append(x[i, : mask[i]]) + x = x_list + + if self.gated: + x_gate = [torch.mm(x[i], self.wi_gate[param_slice][i]) for i in range(e)] + x_up = [torch.mm(x[i], self.wi_up[param_slice][i]) for i in range(e)] + if self.use_kernel and HAS_TRITON and self.act_name == "swiglu": + x = [LlamaActCombine.apply(x_gate[i], x_up[i]) for i in range(e)] + else: + x = [self.act(x_gate[i]) * x_up[i] for i in range(e)] + else: + x = [torch.mm(x[i], self.wi[param_slice][i]) for i in range(e)] + x = [self.act(x[i]) for i in range(e)] + x = [self.drop(x[i]) for i in range(e)] + x = [torch.mm(x[i], self.wo[param_slice][i]) for i in range(e)] + + if self.use_kernel and use_sparse: + for i in range(e): + x[i] = torch.nn.functional.pad(x[i], (0, 0, 0, seq_len - x[i].shape[0]), mode="constant", value=0) + + x = torch.cat([x[i].unsqueeze(0) for i in range(e)], dim=0) + x = x.reshape(inshape) + x = x.transpose(0, 1).contiguous() + x = MoeOutGradScaler.apply(x, self.ep_size) + return x diff --git a/colossalai/shardformer/modeling/mixtral.py b/colossalai/shardformer/modeling/mixtral.py index a2b78a2bd18c..8be5b7294f66 100644 --- a/colossalai/shardformer/modeling/mixtral.py +++ b/colossalai/shardformer/modeling/mixtral.py @@ -1,23 +1,23 @@ import torch import torch.distributed as dist import torch.nn.functional as F + +# from colossalai.tensor.moe_tensor.moe_info import MoeParallelInfo +from torch.distributed import ProcessGroup from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock from colossalai.lazy import LazyInitContext -from colossalai.moe import MOE_MANAGER from colossalai.moe._operation import MoeInGradScaler, MoeOutGradScaler, all_to_all_uneven from colossalai.shardformer.shard.utils import set_tensors_to_none -from colossalai.tensor.moe_tensor.api import set_moe_tensor_info class EPMixtralSparseMoeBlock(MixtralSparseMoeBlock): def __init__(self, config): + self.moe_info = None super().__init__(config) - self.setup_ep() - def setup_ep(self): - _, moe_info = MOE_MANAGER.get_info(self.num_experts) - ep_group = 
moe_info.ep_group + def setup_ep(self, ep_group: ProcessGroup): + ep_group = ep_group self.ep_size = dist.get_world_size(ep_group) if ep_group is not None else 1 self.ep_rank = dist.get_rank(ep_group) if ep_group is not None else 0 assert self.num_experts % self.ep_size == 0 @@ -27,13 +27,15 @@ def setup_ep(self): held_experts = self.experts[self.expert_start_idx : self.expert_start_idx + self.num_experts_per_ep] set_tensors_to_none(self.experts, exclude=set(held_experts)) for p in self.experts.parameters(): - set_moe_tensor_info(p, moe_info) + p.ep_group = ep_group @staticmethod def from_native_module(module: MixtralSparseMoeBlock, *args, **kwargs) -> "EPMixtralSparseMoeBlock": LazyInitContext.materialize(module) module.__class__ = EPMixtralSparseMoeBlock - module.setup_ep() + # if "ep_group" in kwargs: + assert "ep_group" in kwargs, "You should pass ep_group in SubModuleReplacementDescription via shard_config!!" + module.setup_ep(kwargs["ep_group"]) return module def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: diff --git a/colossalai/shardformer/policies/mixtral.py b/colossalai/shardformer/policies/mixtral.py index 9ff7457acb62..8dedbbfbc81f 100644 --- a/colossalai/shardformer/policies/mixtral.py +++ b/colossalai/shardformer/policies/mixtral.py @@ -51,6 +51,8 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: if self.shard_config.enable_tensor_parallelism: raise NotImplementedError("Tensor parallelism is not supported for Mixtral model now.") + if getattr(self.shard_config, "ep_group", None) is None: + raise ValueError("You must pass in ep_group via shard_config for expert parallel!") # expert parallel self.append_or_create_submodule_replacement( @@ -58,6 +60,7 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: SubModuleReplacementDescription( suffix="block_sparse_moe", target_module=EPMixtralSparseMoeBlock, + kwargs={"ep_group": self.shard_config.ep_group}, ) ], policy=policy, @@ -167,7 +170,7 @@ def get_shared_params(self) -> List[Dict[int, Tensor]]: class MixtralForCausalLMPolicy(MixtralPolicy): def module_policy(self): policy = super().module_policy() - + # TODO: assign pg mesh from plugin to all modules if self.shard_config.enable_tensor_parallelism: # add a new item for casual lm new_item = { diff --git a/colossalai/shardformer/shard/shard_config.py b/colossalai/shardformer/shard/shard_config.py index 415fc6dd5f06..5cb21d4820c8 100644 --- a/colossalai/shardformer/shard/shard_config.py +++ b/colossalai/shardformer/shard/shard_config.py @@ -36,6 +36,7 @@ class ShardConfig: enable_sequence_overlap: bool = False parallel_output = True extra_kwargs: Dict[str, Any] = field(default_factory=dict) + ep_group: Optional[ProcessGroup] = None # pipeline_parallel_size: int # data_parallel_size: int # tensor_parallel_mode: Literal['1d', '2d', '2.5d', '3d'] diff --git a/colossalai/tensor/d_tensor/__init__.py b/colossalai/tensor/d_tensor/__init__.py index 6f8097735d57..4129ec62e956 100644 --- a/colossalai/tensor/d_tensor/__init__.py +++ b/colossalai/tensor/d_tensor/__init__.py @@ -2,13 +2,13 @@ compute_global_numel, customized_distributed_tensor_to_param, distribute_tensor, - init_as_dtensor, distribute_tensor_with_customization, - init_tensor_as_customization_distributed, get_device_mesh, get_global_shape, get_layout, get_sharding_spec, + init_as_dtensor, + init_tensor_as_customization_distributed, is_customized_distributed_tensor, is_distributed_tensor, is_sharded, diff --git a/colossalai/tensor/d_tensor/api.py 
b/colossalai/tensor/d_tensor/api.py index da6ef275e108..7258170887ad 100644 --- a/colossalai/tensor/d_tensor/api.py +++ b/colossalai/tensor/d_tensor/api.py @@ -128,7 +128,10 @@ def distribute_tensor(tensor: torch.Tensor, device_mesh: DeviceMesh, sharding_sp return sharded_tensor -def init_as_dtensor(tensor: torch.Tensor, device_mesh: DeviceMesh, sharding_spec: ShardingSpec, global_shape: torch.Size) -> torch.Tensor: + +def init_as_dtensor( + tensor: torch.Tensor, device_mesh: DeviceMesh, sharding_spec: ShardingSpec, global_shape: torch.Size +) -> torch.Tensor: assert not is_distributed_tensor(tensor), "The input tensor is already a distributed tensor." dist_layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec, global_shape=global_shape) @@ -140,6 +143,7 @@ def init_as_dtensor(tensor: torch.Tensor, device_mesh: DeviceMesh, sharding_spec return tensor + def redistribute(dtensor: torch.Tensor, device_mesh: DeviceMesh, sharding_spec: ShardingSpec) -> None: """ Convert the layout of the tensor from source_spec to target_spec. @@ -468,7 +472,6 @@ def gather_fn(tensor): assert callable(gather_fn), "The gather_fn must be callable." assert not is_distributed_tensor(tensor), "The input tensor is already a distributed tensor." - # set the shard_fn and gather_fn as attributes of the distributed tensor tensor.shard_fn = shard_fn tensor.gather_fn = gather_fn diff --git a/colossalai/tensor/moe_tensor/api.py b/colossalai/tensor/moe_tensor/api.py index b6843df7a478..f99a234717fa 100644 --- a/colossalai/tensor/moe_tensor/api.py +++ b/colossalai/tensor/moe_tensor/api.py @@ -17,7 +17,7 @@ def is_moe_tensor(tensor: torch.Tensor) -> bool: Returns: bool: Whether the given tensor is a moe tensor. """ - return hasattr(tensor, "moe_info") + return hasattr(tensor, "ep_group") def set_moe_tensor_info(tensor: torch.Tensor, moe_info: MoeParallelInfo) -> None: @@ -58,7 +58,7 @@ def get_ep_group(tensor: torch.Tensor) -> ProcessGroup: Returns: torch.distributed.ProcessGroup: The expert parallel group of the given tensor. """ - return tensor.moe_info.ep_group + return tensor.ep_group def get_ep_size(tensor: torch.Tensor) -> int: @@ -71,7 +71,8 @@ def get_ep_size(tensor: torch.Tensor) -> int: Returns: int: The expert parallel size of the given tensor. """ - return tensor.moe_info.ep_size + assert getattr(tensor, "ep_group") is not None, "The tensor does not have expert parallel group." + return dist.get_world_size(tensor.ep_group) def get_dp_size(tensor: torch.Tensor) -> int: diff --git a/colossalai/zero/low_level/_utils.py b/colossalai/zero/low_level/_utils.py index de08ecf3d57f..5ab703f09063 100644 --- a/colossalai/zero/low_level/_utils.py +++ b/colossalai/zero/low_level/_utils.py @@ -190,6 +190,7 @@ def calculate_global_norm_from_list(norm_list): total_norm += norm**2.0 return math.sqrt(total_norm) + def sync_tensor(flat_tensor, tensor_list): """ Synchronize the flattened tensor and unflattened tensor list. When diff --git a/colossalai/zero/low_level/low_level_optim.py b/colossalai/zero/low_level/low_level_optim.py index a2433d1b261c..442df842ae41 100644 --- a/colossalai/zero/low_level/low_level_optim.py +++ b/colossalai/zero/low_level/low_level_optim.py @@ -98,8 +98,8 @@ def __init__( # extra dp # This group is used to sync moe param, dp_world_size = moe_duplicates * extra_dp_size. - # Non moe param will be sync by global dp pg, moe param will be sync by extra dp pg. - # Moe param grad is be split as non moe param by global dp pg, and grad will be merged in step. 
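The hunks above replace the old MOE_MANAGER/moe_info plumbing with a plain ep_group attribute: ShardConfig now carries an optional ep_group, the Mixtral policy forwards it to EPMixtralSparseMoeBlock, setup_ep tags every expert parameter with it, and the rewritten moe_tensor API simply reads that attribute. Below is a minimal sketch of how the attribute-based API is expected to behave after this change, assuming the import path from the patch and an already initialized torch.distributed job; the two-rank group is purely hypothetical.

    import torch
    import torch.distributed as dist

    from colossalai.tensor.moe_tensor.api import get_ep_group, get_ep_size, is_moe_tensor


    def tag_expert_params(module: torch.nn.Module, ep_group: dist.ProcessGroup) -> None:
        # Mirrors EPMixtralSparseMoeBlock.setup_ep: expert parallelism is recorded as a
        # plain attribute on each expert parameter instead of a MoeParallelInfo object.
        for p in module.parameters():
            p.ep_group = ep_group


    # Hypothetical usage on a 2-rank job:
    #   ep_group = dist.new_group(ranks=[0, 1])
    #   experts = torch.nn.Linear(16, 16)
    #   tag_expert_params(experts, ep_group)
    #   assert is_moe_tensor(experts.weight)                      # checks hasattr(tensor, "ep_group")
    #   assert get_ep_group(experts.weight) is ep_group
    #   assert get_ep_size(experts.weight) == dist.get_world_size(ep_group)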
+ # Non moe param will sync in global dp pg, moe param will be sync by extra dp pg. + # Moe param grad is split as non moe param by global dp pg, and grad will be merged in step. # And moe working and master param are split by extra dp pg. self.moe_extra_dp_pg = moe_extra_dp_process_group if self.moe_extra_dp_pg is not None: @@ -908,7 +908,7 @@ def update_master_params(self, model: nn.Module) -> None: if padding_size > 0: working_param = torch.nn.functional.pad(working_param, [0, padding_size]) if self.moe_extra_dp_pg is not None and is_moe_tensor(p): - master_param.copy_(working_param.chunk(self.extra_dp_pg_size)[self.extra_dp_pg_rank]) + master_param.copy_(working_param.chunk(self.moe_extra_dp_pg_size)[self.moe_extra_dp_pg_rank]) else: master_param.copy_(working_param.chunk(self._world_size)[self._local_rank]) if hasattr(self, "master_moe_params"): diff --git a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md index 3de41601a231..c25b109e7106 100644 --- a/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md +++ b/docs/source/zh-Hans/advanced_tutorials/train_vit_with_hybrid_parallelism.md @@ -220,7 +220,7 @@ model, optimizer, _criterion, train_dataloader, lr_scheduler = booster.boost( ) ``` ## 使用混合并行训练 ViT -最后就可以使用混合并行策略来训练模型了。我们先定义一个训练函数,描述训练过程。需要注意的是,如果使用了管道并行策略,需要调用`booster.execute_pipeline`来执行模型的训练,它会调用`scheduler`管理模型的前后向操作。 +最后就可以使用混合并行策略来训练模型了。我们先定义一个训练函数,描述训练过程。需要注意的是,如果使用了管道并行策略,需要调用`booster.execute_pipeline`来执行模型的训练,它会调用`scheduler`管理模型的前后向操作。 ```python def run_forward_backward( model: nn.Module, diff --git a/examples/language/data_utils.py b/examples/language/data_utils.py index ec849ef9d1eb..6b9e8ef28eb7 100644 --- a/examples/language/data_utils.py +++ b/examples/language/data_utils.py @@ -121,4 +121,4 @@ def __getitem__(self, idx): "input_ids": self.input_ids[idx], "attention_mask": self.attention_mask[idx], "labels": self.input_ids[idx], - } \ No newline at end of file + } diff --git a/examples/language/openmoe/benchmark/benchmark_cai.py b/examples/language/openmoe/benchmark/benchmark_cai.py index 770c500d86bf..debe2286bd5f 100644 --- a/examples/language/openmoe/benchmark/benchmark_cai.py +++ b/examples/language/openmoe/benchmark/benchmark_cai.py @@ -176,7 +176,7 @@ def main(): use_ep_inside = False plugin = MoeHybridParallelPlugin( pp_size=1, - extra_dp_size=args.extra_dp_size, + ep_size=args.ep_size, use_ep_inside=use_ep_inside, **hybrid_dict, ) diff --git a/examples/language/openmoe/benchmark/utils.py b/examples/language/openmoe/benchmark/utils.py index 7a0955bb028a..096e06bd21b8 100644 --- a/examples/language/openmoe/benchmark/utils.py +++ b/examples/language/openmoe/benchmark/utils.py @@ -50,7 +50,6 @@ def all_reduce_mean(x: float, world_size: int) -> float: class Timer: - def __init__(self) -> None: self.start_time: Optional[float] = None self.duration: float = 0.0 @@ -112,7 +111,7 @@ def on_step_end(self, input_ids: Tensor, **kwargs) -> None: batch_size, seq_len = input_ids.shape self.num_samples += batch_size - self.flop += (batch_size * seq_len * self.model_numel * 2 * (3 + int(self.enable_grad_checkpoint))) + self.flop += batch_size * seq_len * self.model_numel * 2 * (3 + int(self.enable_grad_checkpoint)) def on_fit_end(self) -> None: avg_duration = all_reduce_mean(self.timer.duration, self.world_size) @@ -122,5 +121,6 @@ def on_fit_end(self) -> None: if dist.get_rank() == 0: print( f"num_samples: {self.num_samples}, dp_world_size: 
{self.dp_world_size}, flop: {self.flop}, avg_duration: {avg_duration}, " - f"avg_throughput: {avg_throughput}") + f"avg_throughput: {avg_throughput}" + ) print(f"Throughput: {avg_throughput:.2f} samples/sec, TFLOPS per GPU: {avg_tflops_per_gpu:.2f}") diff --git a/examples/language/openmoe/infer.py b/examples/language/openmoe/infer.py index 04df64531937..50cdc63e8cd9 100644 --- a/examples/language/openmoe/infer.py +++ b/examples/language/openmoe/infer.py @@ -16,17 +16,15 @@ def inference(args): tokenizer = T5Tokenizer.from_pretrained("google/umt5-small") if args.model == "test": config = LlamaConfig.from_pretrained("hpcai-tech/openmoe-base") - set_openmoe_args(config, - num_experts=config.num_experts, - moe_layer_interval=config.moe_layer_interval, - enable_kernel=True) + set_openmoe_args( + config, num_experts=config.num_experts, moe_layer_interval=config.moe_layer_interval, enable_kernel=True + ) model = OpenMoeForCausalLM(config) else: config = LlamaConfig.from_pretrained(f"hpcai-tech/openmoe-{args.model}") - set_openmoe_args(config, - num_experts=config.num_experts, - moe_layer_interval=config.moe_layer_interval, - enable_kernel=False) + set_openmoe_args( + config, num_experts=config.num_experts, moe_layer_interval=config.moe_layer_interval, enable_kernel=False + ) model = OpenMoeForCausalLM.from_pretrained(f"hpcai-tech/openmoe-{args.model}", config=config) model = model.eval().bfloat16() model = model.to(torch.cuda.current_device()) diff --git a/examples/language/openmoe/model/convert_openmoe_ckpt.py b/examples/language/openmoe/model/convert_openmoe_ckpt.py index 20b1e780d8b3..3e051850d1cd 100644 --- a/examples/language/openmoe/model/convert_openmoe_ckpt.py +++ b/examples/language/openmoe/model/convert_openmoe_ckpt.py @@ -172,9 +172,9 @@ def make_state_dict(converted_params): def load_t5x_weights_in_t5(model, config, t5x_checkpoint_path): """Replaces the params in model witht the T5X converted params.""" variables = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path) - converted = convert_t5x_to_pytorch(variables, - num_layers=config.num_hidden_layers, - moe_interval=config.moe_layer_interval) + converted = convert_t5x_to_pytorch( + variables, num_layers=config.num_hidden_layers, moe_interval=config.moe_layer_interval + ) state_dict = make_state_dict(converted) model.load_state_dict(state_dict, strict=True) @@ -203,11 +203,9 @@ def convert_t5x_checkpoint_to_pytorch(t5x_checkpoint_path, config_file, pytorch_ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Converts a native T5X checkpoint into a PyTorch checkpoint.") # Required parameters - parser.add_argument("--t5x_checkpoint_path", - default=None, - type=str, - required=True, - help="Path to the T5X checkpoint.") + parser.add_argument( + "--t5x_checkpoint_path", default=None, type=str, required=True, help="Path to the T5X checkpoint." + ) parser.add_argument( "--config_file", default=None, @@ -215,10 +213,8 @@ def convert_t5x_checkpoint_to_pytorch(t5x_checkpoint_path, config_file, pytorch_ required=True, help="The config json file corresponding to the pre-trained T5 model.\nThis specifies the model architecture.", ) - parser.add_argument("--pytorch_dump_path", - default=None, - type=str, - required=True, - help="Path to the output PyTorch model.") + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 
+ ) args = parser.parse_args() convert_t5x_checkpoint_to_pytorch(args.t5x_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/examples/language/openmoe/model/modeling_openmoe.py b/examples/language/openmoe/model/modeling_openmoe.py index eee3b505a22a..4a333537e392 100644 --- a/examples/language/openmoe/model/modeling_openmoe.py +++ b/examples/language/openmoe/model/modeling_openmoe.py @@ -37,9 +37,9 @@ from colossalai.kernel.extensions.flash_attention import HAS_FLASH_ATTN from colossalai.kernel.triton.llama_act_combine_kernel import HAS_TRITON -from colossalai.moe.layers import SparseMLP from colossalai.moe.manager import MOE_MANAGER from colossalai.moe.utils import get_activation, set_moe_args +from colossalai.shardformer.layer.moe import SparseMLP if HAS_TRITON: from colossalai.kernel.triton.llama_act_combine_kernel import LlamaActCombine @@ -70,7 +70,7 @@ def set_openmoe_args( load_balance_group_swap_factor: float = 0.4, enable_kernel: bool = False, enable_comm_overlap: bool = False, - enable_hierarchical_alltoall: bool = False, + enable_hierarchical_alltoall: bool = True, ) -> None: """ MoE related arguments. @@ -452,7 +452,7 @@ def __init__(self, config: LlamaConfig, moe: bool): load_balance_beam_width=config.load_balance_beam_width, load_balance_group_swap_factor=config.load_balance_group_swap_factor, enable_kernel=config.enable_kernel, - enable_comm_overlap=config.enable_comm_overlap, + enable_hierarchical_comm=config.enable_hierarchical_alltoall, ) self.pre_extra_mlp_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.extra_mlp = OpenMoeMLP(config) @@ -890,7 +890,7 @@ def forward( "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." ```""" # reset moe loss - MOE_MANAGER.reset_loss() + MOE_MANAGER.reset_loss() # TODO: remove output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -1014,7 +1014,7 @@ def _reorder_cache(past_key_values, beam_idx): def _calculate_router_loss(self, aux_loss: list = None, z_loss: list = None): if aux_loss is None or z_loss is None: - aux_loss, z_loss = MOE_MANAGER.get_loss() + aux_loss, z_loss = MOE_MANAGER.get_loss() # TODO: remove assert len(aux_loss) == len(z_loss) == self.config.num_hidden_layers // self.config.moe_layer_interval aux_loss = self.config.router_aux_loss_factor * sum(aux_loss) / len(aux_loss) z_loss = self.config.router_z_loss_factor * sum(z_loss) / len(z_loss) diff --git a/examples/language/openmoe/model/openmoe_policy.py b/examples/language/openmoe/model/openmoe_policy.py index 17e7aa46ce85..9da6800c00b1 100644 --- a/examples/language/openmoe/model/openmoe_policy.py +++ b/examples/language/openmoe/model/openmoe_policy.py @@ -1,4 +1,3 @@ -import warnings from functools import partial from typing import Callable, Dict, List, Optional, Union @@ -21,7 +20,6 @@ class OpenMoePolicy(Policy): - def config_sanity_check(self): pass @@ -43,7 +41,8 @@ def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]: if self.shard_config.enable_sequence_parallelism: self.shard_config.enable_sequence_parallelism = False raise NotImplementedError( - "openmoe doesn't support sequence parallelism now, will ignore the sequence parallelism flag.") + "openmoe doesn't support sequence parallelism now, will ignore the sequence parallelism flag." 
+ ) if self.shard_config.enable_tensor_parallelism: raise NotImplementedError("Tensor parallelism is not supported for openmoe model now.") @@ -100,9 +99,9 @@ def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, poli layers_per_stage = self.distribute_layers(len(module.layers), stage_manager.num_stages) stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage) method_replacement = {"forward": partial(new_forward, stage_manager=stage_manager, stage_index=stage_index)} - self.append_or_create_method_replacement(description=method_replacement, - policy=policy, - target_key=model_cls) + self.append_or_create_method_replacement( + description=method_replacement, policy=policy, target_key=model_cls + ) return @@ -126,12 +125,10 @@ def get_held_layers(self) -> List[Module]: held_layers.append(module.norm) return held_layers - + @staticmethod def distribute_layers(num_layers: int, num_stages: int) -> List[int]: - """Divide layers into stages - - """ + """Divide layers into stages""" if num_layers == 24 and num_stages == 4: return [7, 7, 7, 3] elif num_layers == 24 and num_stages == 2: @@ -146,7 +143,6 @@ def distribute_layers(num_layers: int, num_stages: int) -> List[int]: class OpenMoeModelPolicy(OpenMoePolicy): - def __init__(self) -> None: super().__init__() @@ -172,21 +168,22 @@ def get_shared_params(self) -> List[Dict[int, Tensor]]: class OpenMoeForCausalLMPolicy(OpenMoePolicy): - def module_policy(self): policy = super().module_policy() if self.shard_config.enable_tensor_parallelism: # add a new item for casual lm + # TODO: recursively assign ep group foe all modules new_item = { - OpenMoeForCausalLM: - ModulePolicyDescription(sub_module_replacement=[ + OpenMoeForCausalLM: ModulePolicyDescription( + sub_module_replacement=[ SubModuleReplacementDescription( suffix="lm_head", target_module=Linear1D_Col, kwargs=dict(gather_output=True), ) - ]) + ] + ) } policy.update(new_item) @@ -211,13 +208,17 @@ def get_held_layers(self) -> List[Module]: def get_shared_params(self) -> List[Dict[int, Tensor]]: llama_model = self.model.model if self.pipeline_stage_manager and self.pipeline_stage_manager.num_stages > 1: - if (id(llama_model.embed_tokens.weight) == id(self.model.lm_head.weight) - and self.pipeline_stage_manager.num_stages > 1): + if ( + id(llama_model.embed_tokens.weight) == id(self.model.lm_head.weight) + and self.pipeline_stage_manager.num_stages > 1 + ): # tie weights - return [{ - 0: llama_model.embed_tokens.weight, - self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight, - }] + return [ + { + 0: llama_model.embed_tokens.weight, + self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight, + } + ] return [] @@ -250,12 +251,13 @@ def openmoe_model_forward( logger = logging.get_logger(__name__) - output_attentions = (output_attentions if output_attentions is not None else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else self.config.output_hidden_states) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_dict = (return_dict if return_dict is not None else self.config.use_return_dict) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # retrieve input_ids and 
inputs_embeds if stage_manager.is_first_stage(): @@ -323,7 +325,8 @@ def openmoe_model_forward( if self.gradient_checkpointing and self.training: if use_cache: logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...") + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) use_cache = False # decoder layers @@ -336,12 +339,11 @@ def openmoe_model_forward( if output_hidden_states: all_hidden_states += (hidden_states,) - past_key_value = (past_key_values[idx] if past_key_values is not None else None) + past_key_value = past_key_values[idx] if past_key_values is not None else None if self.gradient_checkpointing and self.training: def create_custom_forward(module): - def custom_forward(*inputs): # None for past_key_value return module(*inputs, output_attentions, None) @@ -387,14 +389,16 @@ def custom_forward(*inputs): router_z_loss = past_router_z_loss + router_z_loss if stage_manager.is_last_stage(): - return tuple([ - hidden_states, - next_cache, - all_hidden_states, - all_self_attns, - router_aux_loss, - router_z_loss, - ]) + return tuple( + [ + hidden_states, + next_cache, + all_hidden_states, + all_self_attns, + router_aux_loss, + router_z_loss, + ] + ) # always return dict for imediate stage return { "hidden_states": hidden_states, @@ -448,10 +452,11 @@ def llama_for_causal_lm_forward( "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." ```""" logger = logging.get_logger(__name__) - output_attentions = (output_attentions if output_attentions is not None else self.config.output_attentions) - output_hidden_states = (output_hidden_states - if output_hidden_states is not None else self.config.output_hidden_states) - return_dict = (return_dict if return_dict is not None else self.config.use_return_dict) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # TODO(jianghai): left the recording kv-value tensors as () or None type, this feature may be added in the future. 
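One step in the forward above that is easy to misread is the create_custom_forward wrapper: torch.utils.checkpoint only replays the positional tensor inputs it is given, so non-tensor arguments such as output_attentions and the disabled past_key_value are bound in a closure before each decoder layer is checkpointed. A generic sketch of the pattern follows, using a toy layer rather than OpenMoe code.

    import torch
    import torch.nn as nn
    from torch.utils.checkpoint import checkpoint


    class ToyLayer(nn.Module):
        def __init__(self, dim: int = 16):
            super().__init__()
            self.proj = nn.Linear(dim, dim)

        def forward(self, x: torch.Tensor, output_attentions: bool, past_key_value=None):
            return self.proj(x)


    def create_custom_forward(module: nn.Module):
        # Bind the non-tensor arguments here; checkpoint() re-runs only the wrapper
        # with the saved tensor inputs during the backward pass.
        def custom_forward(*inputs):
            return module(*inputs, False, None)  # output_attentions=False, past_key_value=None

        return custom_forward


    layer = ToyLayer()
    x = torch.randn(2, 16, requires_grad=True)
    out = checkpoint(create_custom_forward(layer), x, use_reentrant=False)
    out.sum().backward()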
if output_attentions: @@ -507,7 +512,6 @@ def llama_for_causal_lm_forward( if chunk_head == True: def create_custom_forward(module): - def custom_forward(*inputs): logits = module(inputs[0]) logits = logits.float() @@ -525,8 +529,8 @@ def custom_forward(*inputs): for batch_idx in range(hidden_states.shape[0]): loss = loss + torch.utils.checkpoint.checkpoint( create_custom_forward(self.lm_head), - hidden_states[batch_idx:batch_idx + 1, :], - labels[batch_idx:batch_idx + 1, :], + hidden_states[batch_idx : batch_idx + 1, :], + labels[batch_idx : batch_idx + 1, :], ) logits = None else: diff --git a/examples/language/openmoe/train.py b/examples/language/openmoe/train.py index 89c4d5420994..5d2bb2d5081e 100644 --- a/examples/language/openmoe/train.py +++ b/examples/language/openmoe/train.py @@ -20,7 +20,6 @@ from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin from colossalai.cluster import DistCoordinator from colossalai.moe.layers import apply_load_balance -from colossalai.moe.manager import MOE_MANAGER from colossalai.moe.utils import skip_init from colossalai.nn.optimizer import HybridAdam @@ -221,48 +220,49 @@ def main(): "precision": args.precision, "zero_stage": args.zero_stage, } - mgr_dict = {} if args.plugin == "ep": dp_size = dist.get_world_size() plugin = MoeHybridParallelPlugin( pp_size=1, + ep_size=args.ep_size, **hybrid_dict, ) - MOE_MANAGER.setup( - parallel="EP", - max_ep_size=dp_size, - **mgr_dict, - ) + # MOE_MANAGER.setup( + # parallel="EP", + # max_ep_size=dp_size, + # **mgr_dict, + # ) elif args.plugin == "ep_zero": dp_size = dist.get_world_size() use_ep_inside = False plugin = MoeHybridParallelPlugin( pp_size=1, - extra_dp_size=args.extra_dp_size, + ep_size=dp_size // args.ep_size, use_ep_inside=use_ep_inside, **hybrid_dict, ) - MOE_MANAGER.setup( - parallel="EP", - max_ep_size=dp_size // args.extra_dp_size, - use_ep_inside=use_ep_inside, - **mgr_dict, - ) + # MOE_MANAGER.setup( + # parallel="EP", + # max_ep_size=dp_size // args.extra_dp_size, + # use_ep_inside=use_ep_inside, + # **mgr_dict, + # ) elif args.plugin == "hybrid": dp_size = dist.get_world_size() // args.pp_size plugin = MoeHybridParallelPlugin( pp_size=args.pp_size, + ep_size=args.ep_size, microbatch_size=args.microbatch_size, **hybrid_dict, ) - MOE_MANAGER.setup( - parallel="EP", - mode="fixed", - fixed_dp_size=args.dp_size, - fixed_ep_size=args.ep_size, - fixed_pp_size=args.pp_size, - **mgr_dict, - ) + # MOE_MANAGER.setup( + # parallel="EP", + # mode="fixed", + # fixed_dp_size=args.dp_size, + # fixed_ep_size=args.ep_size, + # fixed_pp_size=args.pp_size, + # **mgr_dict, + # ) else: raise ValueError(f"Invalid plugin {args.plugin}") coordinator.print_on_master(f"Set plugin as {plugin.__class__.__name__}") diff --git a/extensions/cpu_adam/__init__.py b/extensions/cpu_adam/__init__.py index cfd26a6a20f8..d5c69e902a80 100644 --- a/extensions/cpu_adam/__init__.py +++ b/extensions/cpu_adam/__init__.py @@ -1,5 +1,4 @@ from .cpu_adam_arm import CpuAdamArmExtension from .cpu_adam_x86 import CpuAdamX86Extension -__all__ = ['CpuAdamArmExtension', 'CpuAdamX86Extension'] - +__all__ = ["CpuAdamArmExtension", "CpuAdamX86Extension"] diff --git a/extensions/layernorm/__init__.py b/extensions/layernorm/__init__.py index 9d1bd2d019ee..30e6c68eff89 100644 --- a/extensions/layernorm/__init__.py +++ b/extensions/layernorm/__init__.py @@ -1,3 +1,3 @@ from .layernorm_cuda import LayerNormCudaExtension -__all__ = ["LayerNormCudaExtension"] \ No newline at end of file +__all__ = 
["LayerNormCudaExtension"] diff --git a/extensions/moe/__init__.py b/extensions/moe/__init__.py index 962084d4bdde..3b6aa24bf7f6 100644 --- a/extensions/moe/__init__.py +++ b/extensions/moe/__init__.py @@ -1,3 +1,3 @@ from .moe_cuda import MoeCudaExtension -__all__ = ['MoeCudaExtension'] \ No newline at end of file +__all__ = ["MoeCudaExtension"] diff --git a/extensions/optimizer/__init__.py b/extensions/optimizer/__init__.py index 9c8e87cae5de..6a0c8d7b8016 100644 --- a/extensions/optimizer/__init__.py +++ b/extensions/optimizer/__init__.py @@ -1,3 +1,3 @@ from .fused_optimizer_cuda import FusedOptimizerCudaExtension -__all__ = ['FusedOptimizerCudaExtension'] \ No newline at end of file +__all__ = ["FusedOptimizerCudaExtension"] diff --git a/extensions/softmax/__init__.py b/extensions/softmax/__init__.py index 9bc50c6cd91c..8833d93e73d0 100644 --- a/extensions/softmax/__init__.py +++ b/extensions/softmax/__init__.py @@ -1,4 +1,4 @@ from .scaled_masked_softmax_cuda import ScaledMaskedSoftmaxCudaExtension from .scaled_upper_triangle_masked_softmax_cuda import ScaledUpperTriangleMaskedSoftmaxCudaExtension -__all__ = ['ScaledMaskedSoftmaxCudaExtension', 'ScaledUpperTriangleMaskedSoftmaxCudaExtension'] \ No newline at end of file +__all__ = ["ScaledMaskedSoftmaxCudaExtension", "ScaledUpperTriangleMaskedSoftmaxCudaExtension"] diff --git a/tests/kit/model_zoo/__init__.py b/tests/kit/model_zoo/__init__.py index 5f6789ff3357..66c794a7d891 100644 --- a/tests/kit/model_zoo/__init__.py +++ b/tests/kit/model_zoo/__init__.py @@ -1,33 +1,33 @@ import os + from . import custom, diffusers, timm, torchaudio, torchvision, transformers from .executor import run_fwd, run_fwd_bwd from .registry import model_zoo # We pick a subset of models for fast testing in order to reduce the total testing time COMMON_MODELS = [ - 'custom_hanging_param_model', - 'custom_nested_model', - 'custom_repeated_computed_layers', - 'custom_simple_net', - 'diffusers_clip_text_model', - 'diffusers_auto_encoder_kl', - 'diffusers_unet2d_model', - 'timm_densenet', - 'timm_resnet', - 'timm_swin_transformer', - 'torchaudio_wav2vec2_base', - 'torchaudio_conformer', - 'transformers_bert_for_masked_lm', - 'transformers_bloom_for_causal_lm', - 'transformers_falcon_for_causal_lm', - 'transformers_chatglm_for_conditional_generation', - 'transformers_llama_for_casual_lm', - 'transformers_vit_for_masked_image_modeling', - 'transformers_mistral_for_casual_lm' + "custom_hanging_param_model", + "custom_nested_model", + "custom_repeated_computed_layers", + "custom_simple_net", + "diffusers_clip_text_model", + "diffusers_auto_encoder_kl", + "diffusers_unet2d_model", + "timm_densenet", + "timm_resnet", + "timm_swin_transformer", + "torchaudio_wav2vec2_base", + "torchaudio_conformer", + "transformers_bert_for_masked_lm", + "transformers_bloom_for_causal_lm", + "transformers_falcon_for_causal_lm", + "transformers_chatglm_for_conditional_generation", + "transformers_llama_for_casual_lm", + "transformers_vit_for_masked_image_modeling", + "transformers_mistral_for_casual_lm", ] -IS_FAST_TEST = os.environ.get('FAST_TEST', '0') == '1' - +IS_FAST_TEST = os.environ.get("FAST_TEST", "0") == "1" -__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", 'COMMON_MODELS', 'IS_FAST_TEST'] +__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", "COMMON_MODELS", "IS_FAST_TEST"] diff --git a/tests/kit/model_zoo/registry.py b/tests/kit/model_zoo/registry.py index fce81ab52c2b..a16b16ad6af7 100644 --- a/tests/kit/model_zoo/registry.py +++ b/tests/kit/model_zoo/registry.py @@ -102,4 
+102,4 @@ def get_sub_registry( return new_dict -model_zoo = ModelZooRegistry() \ No newline at end of file +model_zoo = ModelZooRegistry() diff --git a/tests/kit/model_zoo/transformers/chatglm2.py b/tests/kit/model_zoo/transformers/chatglm2.py index e27fdb4e2efe..0b178d58ce33 100644 --- a/tests/kit/model_zoo/transformers/chatglm2.py +++ b/tests/kit/model_zoo/transformers/chatglm2.py @@ -2,6 +2,7 @@ from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel + from ..registry import ModelAttribute, model_zoo # ================================ diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py index 285c4866c441..d629e769d715 100644 --- a/tests/test_booster/test_plugin/test_3d_plugin.py +++ b/tests/test_booster/test_plugin/test_3d_plugin.py @@ -276,4 +276,4 @@ def test_gemini_plugin(early_stop: bool = True): if __name__ == "__main__": - test_gemini_plugin(early_stop=False) \ No newline at end of file + test_gemini_plugin(early_stop=False) diff --git a/tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py b/tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py index dca562a3b837..1ea70368eabf 100644 --- a/tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_torch_fsdp_checkpoint_io.py @@ -113,6 +113,7 @@ def run_model(): full_osd = FSDP.full_optim_state_dict(optimizer.unwrap_model().unwrap(), optim=optimizer) import copy + sharded_osd = copy.deepcopy(full_osd) run_model() diff --git a/tests/test_gptq/test_gptq_linear.py b/tests/test_gptq/test_gptq_linear.py index 9b650aa78112..ded70fa43c30 100644 --- a/tests/test_gptq/test_gptq_linear.py +++ b/tests/test_gptq/test_gptq_linear.py @@ -1,16 +1,8 @@ -import math -import time - -import numpy as np import pytest import torch -import torch.nn as nn -import transformers from packaging import version try: - import triton - import triton.language as tl HAS_TRITON = True except ImportError: HAS_TRITON = False @@ -22,6 +14,7 @@ from exllama_kernels import prepare_buffers, set_tuning_params from colossalai.inference.quant.gptq import CaiQuantLinear + HAS_AUTO_GPTQ = True except: HAS_AUTO_GPTQ = False @@ -32,13 +25,14 @@ HAS_GPTQ_CUDA = False try: from colossalai.kernel.op_builder.gptq import GPTQBuilder + gptq_cuda = GPTQBuilder().load() HAS_GPTQ_CUDA = True except ImportError: - warnings.warn('CUDA gptq is not installed') + warnings.warn("CUDA gptq is not installed") HAS_GPTQ_CUDA = False -TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse('11.4') +TRITON_CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.4") max_inner_outer_dim = 1 max_input_len = 1 @@ -64,9 +58,9 @@ def init_buffer(cai_linear, use_act_order=False): max_input_len = 4096 # The temp_state buffer is required to reorder X in the act-order case. # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. 
- gptq_temp_state_buffer = torch.zeros((max_input_len, max_inner_outer_dim), - dtype=torch.float16, - device=torch.cuda.current_device()) + gptq_temp_state_buffer = torch.zeros( + (max_input_len, max_inner_outer_dim), dtype=torch.float16, device=torch.cuda.current_device() + ) gptq_temp_dq_buffer = torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=torch.cuda.current_device()) gptq_cuda.prepare_buffers(torch.device(torch.cuda.current_device()), gptq_temp_state_buffer, gptq_temp_dq_buffer) @@ -77,10 +71,11 @@ def init_buffer(cai_linear, use_act_order=False): gptq_cuda.set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2) -@pytest.mark.skipif(not TRITON_CUDA_SUPPORT or not HAS_TRITON or not HAS_AUTO_GPTQ, - reason="triton requires cuda version to be higher than 11.4 or not install auto-gptq") +@pytest.mark.skipif( + not TRITON_CUDA_SUPPORT or not HAS_TRITON or not HAS_AUTO_GPTQ, + reason="triton requires cuda version to be higher than 11.4 or not install auto-gptq", +) def test_gptq_linear(): - infeature = 1024 outfeature = 1024 group_size = 128 @@ -120,7 +115,7 @@ def test_gptq_linear(): max_input_len = 2048 buffers = { "temp_state": torch.zeros((max_input_len, max_inner_outer_dim), dtype=torch.float16, device=device), - "temp_dq": torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=device) + "temp_dq": torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=device), } prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"]) @@ -146,5 +141,4 @@ def test_gptq_linear(): if __name__ == "__main__": - test_gptq_linear() diff --git a/tests/test_lazy/test_models.py b/tests/test_lazy/test_models.py index aeca5f21dc1d..d0c4cd0a7c48 100644 --- a/tests/test_lazy/test_models.py +++ b/tests/test_lazy/test_models.py @@ -24,4 +24,4 @@ def test_torchvision_models_lazy_init(subset, default_device): if __name__ == "__main__": - test_torchvision_models_lazy_init("transformers", "cpu") \ No newline at end of file + test_torchvision_models_lazy_init("transformers", "cpu") diff --git a/tests/test_moe/test_moe_load_balance.py b/tests/test_moe/test_moe_load_balance.py index 717bb99fb830..33b8a4a47e93 100644 --- a/tests/test_moe/test_moe_load_balance.py +++ b/tests/test_moe/test_moe_load_balance.py @@ -6,8 +6,8 @@ from colossalai.booster import Booster from colossalai.booster.plugin import LowLevelZeroPlugin from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel -from colossalai.moe.layers import apply_load_balance from colossalai.moe.manager import MOE_MANAGER +from colossalai.shardformer.layer.moe import apply_load_balance from colossalai.tensor.moe_tensor.api import is_moe_tensor from colossalai.testing import rerun_if_address_is_in_use, spawn from tests.test_moe.moe_utils import MoeGradientHandler, MoeModel diff --git a/tests/test_shardformer/test_layer/test_dist_crossentropy.py b/tests/test_shardformer/test_layer/test_dist_crossentropy.py index f594a80a43e0..414157c2233d 100644 --- a/tests/test_shardformer/test_layer/test_dist_crossentropy.py +++ b/tests/test_shardformer/test_layer/test_dist_crossentropy.py @@ -38,9 +38,10 @@ def check_dist_crossentropy(rank, world_size, port, ignore_index): org_loss, dist_loss, atol=1e-5 ), f"dist cross entropy loss is not equal to orgin loss\n{org_loss}\n{dist_loss}" - target_grad = torch.chunk(pred.grad, world_size, dim=-1)[rank] - assert torch.allclose(target_grad, dist_pred.grad), f"dist grad is not equal to orgin grad\n{target_grad}\n{dist_pred.grad}" + assert torch.allclose( + 
target_grad, dist_pred.grad + ), f"dist grad is not equal to orgin grad\n{target_grad}\n{dist_pred.grad}" @pytest.mark.dist diff --git a/tests/test_shardformer/test_model/test_shard_gptj.py b/tests/test_shardformer/test_model/test_shard_gptj.py index c83eaaa09e29..0bf9669808fa 100644 --- a/tests/test_shardformer/test_model/test_shard_gptj.py +++ b/tests/test_shardformer/test_model/test_shard_gptj.py @@ -207,6 +207,7 @@ def check_gptj_3d(rank, world_size, port): colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl") run_gptj_3d_test() + @pytest.mark.skip("TODO check_gptj has something wrong.") @pytest.mark.dist @rerun_if_address_is_in_use()