diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py
index 6b56f8c7ad18fa4329ce8f6c139bc083869587ea..8ecc0663fe952ec2ebcb5e93c816d0899ef7fcae 100644
--- a/deepspeed/runtime/pipe/engine.py
+++ b/deepspeed/runtime/pipe/engine.py
@@ -1060,7 +1060,7 @@ class PipelineEngine(DeepSpeedEngine):
                                                       local_part=outputs[1],
                                                       group=self.grid.get_slice_parallel_group())
             outputs[0].data = part_output.full()
-            outputs = ([outputs[0], *outputs[2:]])
+            outputs = (outputs[0], *outputs[2:])
             # save for backward
             self.pipe_buffers['outputs'][buffer_id] = outputs
 
@@ -1072,8 +1072,26 @@ class PipelineEngine(DeepSpeedEngine):
                                                         dtype=outputs.dtype,
                                                         num_buffers=1)[0]
             else:
-                sizes_and_dtypes = [(list(t.size()),
-                                     t.dtype) for t in outputs if t.is_floating_point()]
+                # XXX This is a HACK
+                # When we exchange activations/gradients, the two pipe stages
+                # need to issue the send/recv with the same buffer sizes or
+                # else there is a deadlock. The is_floating_point() filter is
+                # used to avoid sending gradients for tensors that do not
+                # produce gradients. When TP>1, we partition the first
+                # activations/gradients across TP ranks to save communication
+                # volume and memory. That partitioned tensor is represented as
+                # two tensors: a 1/TPth chunk of the original data and also a
+                # small LongTensor storing the metadata used to reconstruct on
+                # the other side. When combined, the floating point filter also
+                # filtered out the metadata tensor. A quick (hacky) fix just
+                # branches on is_grad_partitioned so we don't filter out the
+                # metadata tensor.
+                if self.is_grad_partitioned:
+                    sizes_and_dtypes = [(list(t.size()), t.dtype) for t in outputs]
+                else:
+                    sizes_and_dtypes = [(list(t.size()),
+                                         t.dtype) for t in outputs
+                                        if t.is_floating_point()]
                 self.grad_layer = self._allocate_buffers(sizes_and_dtypes,
                                                          num_buffers=1)[0]
 
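
For reference, a minimal self-contained sketch (not part of the patch; the tensor names, shapes, and dtypes below are invented for illustration) of why the is_floating_point() filter misallocates receive buffers when the first activation/gradient is partitioned across TP ranks:

# Illustrative only: mimics the tuple layout _exec_recv_grads sees when TP>1.
# The names meta, chunk, and other are hypothetical stand-ins, not DeepSpeed API.
import torch

# A partitioned first output travels as two tensors: a LongTensor of
# reconstruction metadata plus this rank's 1/TPth slice of the data,
# followed by any unpartitioned activations.
meta = torch.tensor([2, 1024, 4, 256], dtype=torch.long)  # reconstruction metadata
chunk = torch.randn(512).half()                           # this rank's data slice
other = torch.randn(4, 256)                               # an unpartitioned activation
outputs = (meta, chunk, other)

# The old filter drops `meta` because LongTensors are not floating point,
# so the receiver allocates fewer buffers than the sender will send.
filtered = [(list(t.size()), t.dtype) for t in outputs if t.is_floating_point()]
assert len(filtered) == 2  # metadata tensor silently dropped

# The patched branch keeps every tensor when gradients are partitioned,
# so the allocated grad_layer buffers line up with the paired send.
is_grad_partitioned = True
if is_grad_partitioned:
    sizes_and_dtypes = [(list(t.size()), t.dtype) for t in outputs]
else:
    sizes_and_dtypes = [(list(t.size()), t.dtype) for t in outputs if t.is_floating_point()]
assert len(sizes_and_dtypes) == 3

Under this assumption the receiver allocates one buffer per tensor actually sent, including the metadata LongTensor, so the send/recv buffer counts match and the deadlock described in the patch comment is avoided.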