@@ -84,10 +84,10 @@ def __init__(self, gemm_func: Callable, max_tokens_per_round: int = 4096):
     def all_gather(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor,
                    tp_sizes: List[int]):
         """All gather."""
-        hidden_states, _ = dist.gather_by_tp_sizes(hidden_states, tp_sizes, group=self.gather_group, async_op=True)
-        topk_weights, _ = dist.gather_by_tp_sizes(topk_weights, tp_sizes, group=self.gather_group, async_op=True)
-        topk_ids, handle = dist.gather_by_tp_sizes(topk_ids, tp_sizes, group=self.gather_group, async_op=True)
-        return hidden_states, topk_weights, topk_ids, handle
+        hidden_states, h0 = dist.gather_by_tp_sizes(hidden_states, tp_sizes, group=self.gather_group, async_op=True)
+        topk_weights, h1 = dist.gather_by_tp_sizes(topk_weights, tp_sizes, group=self.gather_group, async_op=True)
+        topk_ids, h2 = dist.gather_by_tp_sizes(topk_ids, tp_sizes, group=self.gather_group, async_op=True)
+        return hidden_states, topk_weights, topk_ids, (h0, h1, h2)
 
     def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor, tp_sizes: List[int]):
         """Reduce scatter."""
@@ -100,9 +100,10 @@ def reduce_scatter(self, hidden_states: torch.Tensor, out_states: torch.Tensor,
         return out_states, handle
 
     def _gemm_and_reduce_scatter(self, hidden_states: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor,
-                                 output_states: torch.Tensor, tp_sizes: List[int], handle: dist.Work):
+                                 output_states: torch.Tensor, tp_sizes: List[int], handles: List[dist.Work]):
         """Gemm and reduce scatter."""
-        handle.wait()
+        for handle in handles:
+            handle.wait()
         cur_out = self.gemm_func(hidden_states, topk_weights, topk_ids)
         return self.reduce_scatter(cur_out, output_states, tp_sizes)
 
@@ -129,13 +130,13 @@ def __slice_and_gather():
             cur_output, output_states = __slice_tensor(output_states, slice_size)
 
             # all gather
-            cur_hidden_states, cur_topk_weights, cur_topk_ids, handle = self.all_gather(
+            cur_hidden_states, cur_topk_weights, cur_topk_ids, handles = self.all_gather(
                 cur_hidden_states, cur_topk_weights, cur_topk_ids, cur_tp_sizes)
             return dict(hidden_states=cur_hidden_states,
                         topk_weights=cur_topk_weights,
                         topk_ids=cur_topk_ids,
                         output_states=cur_output,
-                        handle=handle,
+                        handles=handles,
                         tp_sizes=cur_tp_sizes)
 
         step_ctx = get_step_ctx_manager().current_context()
@@ -149,15 +150,19 @@ def __slice_and_gather():
         # pre
         cur_inputs = __slice_and_gather()
 
+        out_handles = []
         # main loop
         while tp_sizes.sum() > 0:
             next_inputs = __slice_and_gather()
-            self._gemm_and_reduce_scatter(**cur_inputs)
+            _, handle = self._gemm_and_reduce_scatter(**cur_inputs)
+            out_handles.append(handle)
             cur_inputs = next_inputs
 
         # post
         _, handle = self._gemm_and_reduce_scatter(**cur_inputs)
-        handle.wait()
+        out_handles.append(handle)
+        for handle in out_handles:
+            handle.wait()
         return return_states
 
 
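The patch changes two things: `all_gather` now returns all three async `Work` handles instead of discarding the first two and waiting only on the `topk_ids` gather, and the per-round reduce-scatter handles are collected into `out_handles` so the waits happen once after the loop, letting each round's communication overlap the next round's GEMM. A minimal sketch of the multi-handle pattern using plain `torch.distributed` collectives (the `gather_three_async` helper, shapes, and `all_gather_into_tensor` stand in for the `gather_by_tp_sizes` helper used in the patch, which is not shown here):

```python
import torch
import torch.distributed as dist


def gather_three_async(hidden_states: torch.Tensor, topk_weights: torch.Tensor,
                       topk_ids: torch.Tensor, group=None):
    """Launch one async all-gather per tensor; return outputs plus all Work handles."""
    world_size = dist.get_world_size(group)
    outputs, handles = [], []
    for t in (hidden_states, topk_weights, topk_ids):
        out = t.new_empty((world_size * t.shape[0], *t.shape[1:]))
        # async_op=True returns a Work handle instead of blocking the caller
        handles.append(dist.all_gather_into_tensor(out, t, group=group, async_op=True))
        outputs.append(out)
    return (*outputs, tuple(handles))


# The consumer must wait on every handle before reading any gathered tensor,
# mirroring the loop added to _gemm_and_reduce_scatter:
# hidden, weights, ids, handles = gather_three_async(hidden, weights, ids)
# for handle in handles:
#     handle.wait()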