From ad4fe8aaf6209e17edeb2d9978c0a36be4e3fb28 Mon Sep 17 00:00:00 2001
From: fc
Date: Fri, 10 May 2024 10:16:31 +0800
Subject: [PATCH] [FIX] fix deadlock in PipeEngine._exec_recv_grads

---
 deepspeed/runtime/pipe/engine.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py
index be8fe1a368c6..7bb6f9a5018d 100644
--- a/deepspeed/runtime/pipe/engine.py
+++ b/deepspeed/runtime/pipe/engine.py
@@ -1204,9 +1204,7 @@ def _exec_recv_grads(self, buffer_id):
                 # branches on is_grad_partitioned so we don't filter out the
                 # metadata tensor.
                 if self.is_grad_partitioned:
-                    sizes_and_dtypes = [(list(t.size()), t.dtype)
-                                        for t in outputs[:2]] + [(list(t.size()), t.dtype)
-                                                                 for t in outputs[2:] if t.is_floating_point()]
+                    sizes_and_dtypes = [(list(t.size()), t.dtype) for t in outputs[:2]]
                 else:
                     sizes_and_dtypes = [(list(t.size()), t.dtype) for t in outputs if t.is_floating_point()]
                 self.grad_layer = self._allocate_buffers(sizes_and_dtypes, num_buffers=1)[0]