Unverified commit 17a479dd authored by Conglong Li, committed by GitHub

fix pipeline engine (#1473)

* fix pp

* better fix
Parent 58a8e13c
@@ -1060,7 +1060,7 @@ class PipelineEngine(DeepSpeedEngine):
local_part=outputs[1],
group=self.grid.get_slice_parallel_group())
outputs[0].data = part_output.full()
-            outputs = ([outputs[0], *outputs[2:]])
+            outputs = (outputs[0], *outputs[2:])
# save for backward
self.pipe_buffers['outputs'][buffer_id] = outputs
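Note on the first hunk: `([outputs[0], *outputs[2:]])` is just a parenthesized list literal, so the old code rebuilt `outputs` as a `list` rather than a `tuple`; the pipeline's buffer and send/recv paths expect tuple outputs (an inference from the surrounding code, not stated in the diff). A minimal, self-contained sketch of the Python semantics being fixed:

```python
# Toy stand-ins for a stage's outputs: (meta, partitioned chunk, extras...).
outputs = ["meta", "part", "extra1", "extra2"]

# Old code: outer parentheses do not turn a list literal into a tuple.
broken = ([outputs[0], *outputs[2:]])   # -> ['meta', 'extra1', 'extra2'] (a list)

# Fixed code: a genuine tuple built by unpacking.
fixed = (outputs[0], *outputs[2:])      # -> ('meta', 'extra1', 'extra2') (a tuple)

assert isinstance(broken, list)
assert isinstance(fixed, tuple)
```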
@@ -1072,8 +1072,26 @@ class PipelineEngine(DeepSpeedEngine):
dtype=outputs.dtype,
num_buffers=1)[0]
else:
-            sizes_and_dtypes = [(list(t.size()),
-                                 t.dtype) for t in outputs if t.is_floating_point()]
+            # XXX This is a HACK
+            # When we exchange activations/gradients, the two pipe stages
+            # need to issue the send/recv with the same buffer sizes or
+            # else there is a deadlock. The is_floating_point() filter is
+            # used to avoid sending gradients for tensors that do not
+            # produce gradients. When TP>1, we partition the first
+            # activations/gradients across TP ranks to save communication
+            # volume and memory. That partitioned tensor is represented as
+            # two tensors: a 1/TPth chunk of the original data and also a
+            # small LongTensor storing the metadata used to reconstruct on
+            # the other side. When combined, the floating-point filter also
+            # filters out the metadata tensor. A quick (hacky) fix just
+            # branches on is_grad_partitioned so we don't filter out the
+            # metadata tensor.
+            if self.is_grad_partitioned:
+                sizes_and_dtypes = [(list(t.size()), t.dtype) for t in outputs]
+            else:
+                sizes_and_dtypes = [(list(t.size()),
+                                     t.dtype) for t in outputs
+                                    if t.is_floating_point()]
self.grad_layer = self._allocate_buffers(sizes_and_dtypes,
num_buffers=1)[0]
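To see why the branch on `is_grad_partitioned` is needed, here is an illustrative sketch (toy tensors standing in for a stage's real outputs; not DeepSpeed code) of how the `is_floating_point()` filter silently drops the `LongTensor` metadata, leaving the receiver to allocate fewer buffers than the sender posts, which is exactly the deadlock the comment describes:

```python
import torch

meta = torch.zeros(4, dtype=torch.long)  # reconstruction metadata (LongTensor)
chunk = torch.randn(8, 16)               # 1/TPth slice of the partitioned activation
other = torch.randn(8, 32)               # an ordinary floating-point output

outputs = (meta, chunk, other)

# Old behavior: the metadata tensor is dropped from the buffer plan.
filtered = [(list(t.size()), t.dtype) for t in outputs if t.is_floating_point()]
# -> [([8, 16], torch.float32), ([8, 32], torch.float32)]

# Fixed behavior when is_grad_partitioned: every tensor is kept, meta included.
unfiltered = [(list(t.size()), t.dtype) for t in outputs]
# -> [([4], torch.int64), ([8, 16], torch.float32), ([8, 32], torch.float32)]

assert len(filtered) == len(outputs) - 1
assert len(unfiltered) == len(outputs)
```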