[Feature] Split cross-entropy computation in SP #5959

Merged Sep 10, 2024 (74 commits)

Commits
c7a19e2
halfway
Edenzzzz Jun 28, 2024
92b4891
fix cross-PP-stage position id length diff bug
Edenzzzz Jun 28, 2024
a9ed834
fix typo
Edenzzzz Jun 29, 2024
20f2a73
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 29, 2024
edb9043
unified cross entropy func for all shardformer models
Edenzzzz Jul 2, 2024
bad530f
remove redundant lines
Edenzzzz Jul 2, 2024
57746c0
add basic ring attn; debug cross entropy
Edenzzzz Jul 8, 2024
1607ea0
fwd bwd logic complete
Edenzzzz Jul 13, 2024
1796b80
fwd bwd logic complete; add experimental triton rescale
Edenzzzz Jul 14, 2024
500454b
precision tests passed
Edenzzzz Jul 18, 2024
c1ea3ba
precision tests passed
Edenzzzz Jul 21, 2024
4b8a412
fix typos and remove misc files
Edenzzzz Jul 22, 2024
4f864b2
update softmax_lse shape by new interface
Edenzzzz Jul 22, 2024
864dac6
change tester name
Edenzzzz Jul 22, 2024
69bf303
remove buffer clone; support packed seq layout
Edenzzzz Jul 23, 2024
ec4fab7
add varlen tests
Edenzzzz Jul 24, 2024
cd9349e
fix typo
Edenzzzz Jul 26, 2024
25d3e38
all tests passed
Edenzzzz Aug 1, 2024
1234d99
add dkv_group; fix mask
Edenzzzz Aug 1, 2024
f6a8f12
remove debug statements
Edenzzzz Aug 1, 2024
c0b7e96
adapt chatglm, command-R, qwen
Edenzzzz Aug 1, 2024
0eb6fdf
debug
Edenzzzz Aug 5, 2024
2fae794
halfway
Edenzzzz Jun 28, 2024
91bab84
fix cross-PP-stage position id length diff bug
Edenzzzz Jun 28, 2024
c5c14b6
fix typo
Edenzzzz Jun 29, 2024
a00c93b
fix typo
Edenzzzz Jun 29, 2024
ab9b784
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 29, 2024
7d99bc0
unified cross entropy func for all shardformer models
Edenzzzz Jul 2, 2024
423102a
remove redundant lines
Edenzzzz Jul 2, 2024
91fb3c1
add basic ring attn; debug cross entropy
Edenzzzz Jul 8, 2024
c050293
fwd bwd logic complete
Edenzzzz Jul 13, 2024
65b4b76
fwd bwd logic complete; add experimental triton rescale
Edenzzzz Jul 14, 2024
5824ede
precision tests passed
Edenzzzz Jul 18, 2024
0e72997
precision tests passed
Edenzzzz Jul 21, 2024
f5a1b99
fix typos and remove misc files
Edenzzzz Jul 22, 2024
52331c9
add sp_mode to benchmark; fix varlen interface
Edenzzzz Jul 22, 2024
c70d03a
update softmax_lse shape by new interface
Edenzzzz Jul 22, 2024
9c83343
add varlen tests
Edenzzzz Jul 24, 2024
cc6472a
fix typo
Edenzzzz Jul 26, 2024
ed4ad6d
all tests passed
Edenzzzz Aug 1, 2024
36f691d
add dkv_group; fix mask
Edenzzzz Aug 1, 2024
6b5d1bf
remove debug statements
Edenzzzz Aug 1, 2024
bdad28a
add comments
Edenzzzz Aug 2, 2024
89343fd
q1 index only once
Edenzzzz Aug 5, 2024
551aaec
remove events to simplify stream sync
Edenzzzz Aug 6, 2024
43c0b65
simplify forward/backward logic
Edenzzzz Aug 9, 2024
fb4e905
2d ring forward passed
Edenzzzz Aug 12, 2024
8bc062d
2d ring backward passed
Edenzzzz Aug 13, 2024
d844ded
fixes
Edenzzzz Aug 14, 2024
8c01223
fix ring attn loss
Edenzzzz Aug 14, 2024
7a7fb1f
2D ring backward + llama passed
Edenzzzz Aug 14, 2024
78ed55d
merge
Edenzzzz Aug 14, 2024
70b1f5d
update logger
Edenzzzz Aug 15, 2024
f91586f
fix typo
Edenzzzz Aug 15, 2024
2ce53f1
rebase
Edenzzzz Aug 16, 2024
5fed9da
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 16, 2024
8ec9009
fix typo
Edenzzzz Aug 16, 2024
d0aeec9
remove typos
Edenzzzz Aug 16, 2024
de0afd1
fixes
Edenzzzz Aug 18, 2024
2fb7db6
support GPT
Edenzzzz Aug 19, 2024
a374633
Merge branch 'main' into split_ce
Edenzzzz Aug 20, 2024
63fd075
fix gpt2
Edenzzzz Aug 20, 2024
17002b6
gpt ring attn + TP passed
Edenzzzz Aug 22, 2024
c6067fe
trim llama forward logic
Edenzzzz Aug 22, 2024
051590d
GPT support sp + pp
Edenzzzz Aug 22, 2024
ce1184c
attempt to simplify code
Edenzzzz Aug 23, 2024
8ad3d5b
Merge branch 'main' into split_ce
Edenzzzz Aug 23, 2024
6d5fc3a
debug
Edenzzzz Aug 23, 2024
4a32c68
fix all-reduce elapsed time
Edenzzzz Aug 27, 2024
5365117
update gpt max seqlen to 32k
Edenzzzz Aug 28, 2024
d0107c3
merge with main
Edenzzzz Aug 28, 2024
177142a
fix typos
Edenzzzz Aug 28, 2024
fc798f4
fix typos
Edenzzzz Aug 29, 2024
04e1c1e
remove
Edenzzzz Sep 2, 2024
10 changes: 8 additions & 2 deletions colossalai/shardformer/layer/_operation.py
@@ -1097,13 +1097,19 @@ def all_to_all_comm(input_, process_group=None, scatter_dim=2, gather_dim=1, fp8
return _AllToAll.apply(input_, process_group, scatter_dim, gather_dim, fp8_communication)


def gather_sp_output(hidden_states, sp_group, sp_mode, fp8_communication=False):
def gather_sp_output(hidden_states, shard_config, sp_dim=1):
"""
Gather the output of the last layer for cross entropy computation
"""
sp_group = shard_config.sequence_parallel_process_group
sp_mode = shard_config.sequence_parallelism_mode
fp8_comm = shard_config.fp8_communication
if dist.get_world_size(sp_group) == 1:
return hidden_states

# Rescale grad (HybridParallelPlugin applies ZeRO grad averaging on the DP * SP group)
scale = None if is_share_sp_tp(sp_mode) else dist.get_world_size(sp_group)
hidden_states = gather_forward_split_backward(
hidden_states, 1, sp_group, grad_scale=scale, fp8_communication=fp8_communication
hidden_states, sp_dim, sp_group, grad_scale=scale, fp8_communication=fp8_comm
)
return hidden_states
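
For reference, a hypothetical call site for the new gather_sp_output signature: the caller now passes the whole ShardConfig instead of threading the process group, SP mode, and fp8 flag separately. This is only a sketch — the lm_head call and the sp_dim=0 variant are illustrative, not code from this PR:

# Hypothetical usage, assuming shard_config carries sequence_parallel_process_group,
# sequence_parallelism_mode, and fp8_communication.
hidden_states = gather_sp_output(hidden_states, shard_config)   # gather along seq dim 1
logits = self.lm_head(hidden_states)                            # logits over the full sequence
# If the sequence sits on another axis, pass it explicitly:
# hidden_states = gather_sp_output(hidden_states, shard_config, sp_dim=0)
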
13 changes: 9 additions & 4 deletions colossalai/shardformer/layer/attn.py
@@ -433,7 +433,6 @@ def get_double_ring_groups(sp_group, inner_ring_size=None):
assert (
sp_size % inner_ring_size == 0
), f"sp_size {sp_size} should be divisible by inner_ring_size {inner_ring_size}"

logger = get_dist_logger()
logger.info(
f"Using 2D Ring Attention with inner ring size {inner_ring_size} to maximze NIC util for inter-node comm. Cross your fingers for speed-ups!",
@@ -898,6 +897,7 @@ def backward(ctx, dout, _):

local_sp_rank = dist.get_rank(sp_group)
sp_size = dist.get_world_size(sp_group)

# Using separate streams (pg) for concurrent kv and dkv comm may
# cause NCCL "software caused connection abort" here...
local_kv_comm = RingComm(local_kv_group)
@@ -1119,9 +1119,14 @@ def prepare_varlen_batch(
the batch dim to a packed 1d sequence. Contingent on model forward shape definitions.

Returns:
inputs_embeds: Packed input embeddings of shape [B, Sq // sp_size, ...].
mask_info: A dictionary of mask info.
position_ids: Packed position ids of shape [..., Sq // sp_size].
torch.Tensor:
Packed input embeddings of shape [B, Sq // sp_size, ...].

Dict[str, Any]:
A dictionary containing mask info.

torch.Tensor:
Packed position ids of shape [..., Sq // sp_size].

"""
_load_varlen_helpers()
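
The divisibility assert in get_double_ring_groups pins down how sequence-parallel ranks are tiled into a 2D ring. Below is a minimal sketch of one such layout, assumed purely for illustration — the function name double_ring_peers is hypothetical and the repository's actual grouping may differ:

def double_ring_peers(sp_rank: int, sp_size: int, inner_ring_size: int):
    # Contiguous inner rings of inner_ring_size ranks keep most ring-attention traffic
    # intra-node; the inter ring links ranks that share a position across inner rings.
    assert sp_size % inner_ring_size == 0, "sp_size must be divisible by inner_ring_size"
    inner_ring = [r for r in range(sp_size) if r // inner_ring_size == sp_rank // inner_ring_size]
    inter_ring = [r for r in range(sp_size) if r % inner_ring_size == sp_rank % inner_ring_size]
    return inner_ring, inter_ring

# e.g. sp_size=8, inner_ring_size=4, sp_rank=5 -> inner ring [4, 5, 6, 7], inter ring [1, 5]
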
5 changes: 2 additions & 3 deletions colossalai/shardformer/layer/loss.py
@@ -153,7 +153,6 @@ def dist_cross_entropy(
labels: torch.Tensor, # [B, S] or [B, S, Vocab_size]
logits: torch.Tensor, # [B, S, Vocab_size]
shard_config: ShardConfig,
out_features: int,
vocab_size: int,
dtype: torch.dtype,
seq_dim: int = 1,
Expand Down Expand Up @@ -226,13 +225,13 @@ def dist_cross_entropy(
logits,
labels,
process_group=shard_config.tensor_parallel_process_group,
vocab_size=out_features,
vocab_size=vocab_size,
dtype=dtype,
mode="sum",
)
else:
# NOTE if use TP and not parallel_output, the output is gathered in VocabParallelLMHead1D
logits = logits.view(-1, vocab_size)
logits = logits.view(-1, logits.size(-1))
loss = loss_fct(logits, labels)

# Reduce loss instead of gathering logits over seq dim for savings
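
The mode="sum" argument and the closing comment are the crux of this PR: each SP rank evaluates cross-entropy only on its local sequence shard, and a scalar reduction replaces gathering [B, S, vocab] logits over the sequence dimension. A rough sketch of that arithmetic follows; sp_cross_entropy is a hypothetical name, and the real dist_cross_entropy additionally handles vocab-parallel logits, label shifting, and dtype casting:

import torch
import torch.distributed as dist
import torch.nn.functional as F

def sp_cross_entropy(local_logits, local_labels, sp_group, ignore_index=-100):
    # Sum of token losses over this rank's sequence shard ([N_local, vocab] / [N_local]).
    loss_sum = F.cross_entropy(
        local_logits.float(), local_labels, ignore_index=ignore_index, reduction="sum"
    )
    # Global count of non-ignored tokens across the sequence-parallel group.
    num_tokens = (local_labels != ignore_index).sum()
    dist.all_reduce(num_tokens, group=sp_group)
    # Local contribution to the global mean; its gradient w.r.t. local logits is already exact.
    loss = loss_sum / num_tokens.clamp(min=1)
    # Reduce the detached scalar so every rank reports the same global mean loss.
    reduced = loss.detach().clone()
    dist.all_reduce(reduced, group=sp_group)
    return loss - loss.detach() + reduced   # value: global mean; gradient: local term only
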
39 changes: 26 additions & 13 deletions colossalai/shardformer/layer/qkv_fused_linear.py
@@ -313,19 +313,19 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:

# Matrix multiply.
bias = self.bias if not self.skip_bias_add else None

if self.seq_parallel_mode is None:
# Set up backprop all-reduce.
input_parallel = reduce_backward(input_, self.process_group, fp8_communication=self.fp8_communication)
output_parallel = matmul_with_async_comm(
if self.seq_parallel_mode == "split_gather":
input_parallel = input_
output_parallel = matmul_gather_forward_reducescatter_backward(
input_parallel,
self.weight,
bias,
self.process_group,
self.async_communication,
True,
1,
self.overlap,
fp8_communication=self.fp8_communication,
)
elif self.seq_parallel_mode == "split_gather":
elif self.seq_parallel_mode == "ring":
input_parallel = input_
output_parallel = matmul_gather_forward_reducescatter_backward(
input_parallel,
@@ -335,13 +335,22 @@ def forward(self, input_: Tensor) -> Tuple[Tensor, Tensor]:
True,
1,
self.overlap,
True,
fp8_communication=self.fp8_communication,
)
elif self.seq_parallel_mode == "ring":
input_parallel = input_
output_parallel = matmul_gather_forward_reducescatter_backward(
input_parallel, self.weight, bias, self.process_group, True, 1, self.overlap, True
elif self.seq_parallel_mode is None or self.seq_parallel_mode == "ring_attn":
# Set up backprop all-reduce.
input_parallel = reduce_backward(input_, self.process_group)
output_parallel = matmul_with_async_comm(
input_parallel,
self.weight,
bias,
self.process_group,
self.async_communication,
fp8_communication=self.fp8_communication,
)
else:
raise NotImplementedError(f"seq_parallel_mode={self.seq_parallel_mode} is not supported!")

if self.gather_output:
# All-gather across the partitions.
@@ -553,7 +562,7 @@ def forward(self, input_: Tensor) -> Tensor:
handle.wait()
output = torch.cat(output_parallel_list, dim=-1)
else:
if self.seq_parallel_mode is None:
if self.seq_parallel_mode is None or self.seq_parallel_mode == "ring_attn":
output_parallel = torch.matmul(input_, self.weight)
output = reduce_forward(output_parallel, self.process_group, fp8_communication=self.fp8_communication)
elif self.seq_parallel_mode == "split_gather":
@@ -567,8 +576,12 @@ def forward(self, input_: Tensor) -> Tensor:
elif self.seq_parallel_mode == "ring":
output_parallel = torch.matmul(input_, self.weight)
output = reducescatter_forward_gather_backward(
output_parallel, self.process_group, 1, self.fp8_communication
output_parallel,
self.process_group,
1,
)
else:
raise NotImplementedError(f"seq_parallel_mode={self.seq_parallel_mode} is not supported!")

if not self.skip_bias_add:
if self.bias is not None:
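
The split_gather branch above leans on matmul_gather_forward_reducescatter_backward: the sequence-sharded activation is all-gathered along the sequence dimension before the GEMM, and in backward each rank's partial input gradient is reduce-scattered (summed over the group, with each rank keeping only its sequence shard). A bare-bones sketch of just that communication pattern, without the fused matmul, overlap, or fp8 paths — the class name is hypothetical and this is not the repository's implementation:

import torch
import torch.distributed as dist

class _GatherFwdReduceScatterBwd(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, group, dim):
        ctx.group, ctx.dim = group, dim
        world = dist.get_world_size(group)
        shards = [torch.empty_like(x) for _ in range(world)]
        dist.all_gather(shards, x.contiguous(), group=group)  # rebuild the full sequence
        return torch.cat(shards, dim=dim)

    @staticmethod
    def backward(ctx, grad_out):
        world = dist.get_world_size(ctx.group)
        # Each rank holds a partial dX for the whole sequence (from its weight shard);
        # sum the partials and keep only the local shard. Assumes even divisibility.
        chunks = [c.contiguous() for c in grad_out.chunk(world, dim=ctx.dim)]
        grad_in = torch.empty_like(chunks[0])
        dist.reduce_scatter(grad_in, chunks, group=ctx.group)
        return grad_in, None, None
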
6 changes: 6 additions & 0 deletions colossalai/shardformer/layer/utils.py
@@ -309,6 +309,9 @@ def split_batch_zigzag(
"""
sp_size = dist.get_world_size(sp_group)
sp_rank = dist.get_rank(sp_group)
if sp_size == 1:
return batch

if isinstance(batch, torch.Tensor):
batch = [batch]
seq_dim = seq_dim if seq_dim != -1 else batch[0].dim() - 1
@@ -364,6 +367,9 @@ def split_varlen_zigzag(
"""
sp_size = dist.get_world_size(sp_group)
sp_rank = dist.get_rank(sp_group)
if sp_size == 1:
return batch

if is_2d:
assert max_seqlen > 0, "max_seqlen must be provided for 2D input"

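
split_batch_zigzag and split_varlen_zigzag shard sequences for ring attention; the new early returns simply skip the split when the SP group has a single rank. The zigzag layout itself, as commonly used to load-balance causal ring attention and assumed here only for illustration, pairs a chunk from the front of the sequence with its mirror from the back so every rank gets a similar amount of attention work:

import torch

def zigzag_split(batch: torch.Tensor, sp_rank: int, sp_size: int, seq_dim: int = 1) -> torch.Tensor:
    # Cut the sequence into 2 * sp_size equal chunks; rank r keeps chunks r and
    # (2 * sp_size - 1 - r), balancing cheap early and expensive late causal positions.
    # Assumes the sequence length divides evenly into 2 * sp_size chunks.
    chunks = batch.chunk(2 * sp_size, dim=seq_dim)
    return torch.cat([chunks[sp_rank], chunks[2 * sp_size - 1 - sp_rank]], dim=seq_dim)

# e.g. sp_size=2: rank 0 keeps chunks [0, 3], rank 1 keeps chunks [1, 2]
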
25 changes: 14 additions & 11 deletions colossalai/shardformer/modeling/bloom.py
@@ -365,14 +365,15 @@ def bloom_for_causal_lm_forward(
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states).contiguous()

loss = dist_cross_entropy(
labels,
lm_logits,
shard_config,
self.lm_head.out_features,
self.config.vocab_size,
self.transformer.dtype,
)
loss = None
if labels is not None:
loss = dist_cross_entropy(
labels,
lm_logits,
shard_config,
self.lm_head.out_features,
self.transformer.dtype,
)

if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]
@@ -1036,9 +1037,11 @@ def forward(
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states)

loss = dist_cross_entropy(
labels, lm_logits, shard_config, self.lm_head.out_features, self.config.vocab_size, self.transformer.dtype
)
loss = None
if labels is not None:
loss = dist_cross_entropy(
labels, lm_logits, shard_config, self.lm_head.out_features, self.transformer.dtype
)

if not return_dict:
output = (lm_logits,) + transformer_outputs[1:]