Unverified commit b8bce682, authored by WangXi, committed by GitHub

xpu support fuse allreduce (#31104)

Parent 59b00e8c
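
From the Python side, the fused allreduce pass that this commit extends to XPU is driven by the fuse_all_reduce_ops build flag. A minimal sketch of turning it on, assuming a static-graph training setup (main_prog, loss, and places are hypothetical placeholders, not part of this commit):

import paddle
import paddle.static as static

paddle.enable_static()

# Fuse per-parameter gradient allreduce ops into grouped allreduce calls;
# with this change the flag also takes effect on XPU, not only CUDA/CPU.
build_strategy = static.BuildStrategy()
build_strategy.fuse_all_reduce_ops = True

# Hypothetical usage: compile an existing program with the strategy.
# compiled_prog = static.CompiledProgram(main_prog).with_data_parallel(
#     loss_name=loss.name, build_strategy=build_strategy, places=places)
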
@@ -76,14 +76,10 @@ void FusedAllReduceOpHandle::RunImpl() {
"handles is %d, and the number of output variable handles is %d.",
in_var_handles.size(), out_var_handles.size()));
// Note: some gradient op doesn't have CUDAKernel, so the gradients of
// those op are in CPUPlace, in this case, the all reduce should not be fused.
#if defined(PADDLE_WITH_XPU_BKCL)
// TODO(liuyuhui): XPU don't support fuse all reduce for now
if (InputIsInDifferentPlace(in_var_handles) || true) {
#else
// Note: some gradient op doesn't have CUDAKernel or XPUKernel, so the
// gradients of those op are in CPUPlace, in this case, the all reduce
// should not be fused.
if (InputIsInDifferentPlace(in_var_handles)) {
#endif
for (size_t j = 0; j < num_of_all_reduce_; ++j) {
std::vector<VarHandle *> dev_inputs;
std::vector<VarHandle *> dev_outputs;
......
@@ -299,6 +299,16 @@ REGISTER_OP_CUDA_KERNEL(
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
#endif
#ifdef PADDLE_WITH_XPU
REGISTER_OP_XPU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, double>);
#endif
REGISTER_OP_VERSION(coalesce_tensor)
.AddCheckpoint(
R"ROC(
......
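
The newly registered coalesce_tensor XPU kernels pack many gradient tensors into one contiguous buffer so that a single collective call can reduce them all. A rough illustration of that coalescing idea in NumPy (an illustration only, not how the op itself is implemented):

import numpy as np

# Illustrative only: copy several gradients into one flat buffer,
# reduce that buffer with a single collective, then view the pieces back.
grads = [np.ones((2, 3), dtype=np.float32), np.ones((4,), dtype=np.float32)]
fused = np.concatenate([g.ravel() for g in grads])   # one contiguous buffer
# ... allreduce(fused) would happen here ...
offsets = np.cumsum([0] + [g.size for g in grads])
views = [fused[offsets[i]:offsets[i + 1]].reshape(g.shape)
         for i, g in enumerate(grads)]                # per-gradient views
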
@@ -23,6 +23,9 @@ size_t Alignment(size_t size, const platform::Place &place) {
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
alignment = GpuMinChunkSize();
#elif defined(PADDLE_WITH_XPU)
// TODO(wangxi): add XpuMinChunkSize
alignment = alignment;
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA."));
......
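
The alignment chosen above is used when sizing the pieces of the fused buffer. A small sketch of the remainder-based rounding such a helper typically performs (an illustration, not the Paddle function itself):

def align_up(size, alignment):
    # Round size up to the nearest multiple of alignment.
    remaining = size % alignment
    return size if remaining == 0 else size + (alignment - remaining)

print(align_up(1000, 256))  # 1024
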
@@ -192,7 +192,6 @@ class TestParallelExecutorBase(unittest.TestCase):
build_strategy.fuse_elewise_add_act_ops = False
build_strategy.fuse_relu_depthwise_conv = False
build_strategy.fuse_all_optimizer_ops = False
build_strategy.fuse_all_reduce_ops = False
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
build_strategy.enable_sequential_execution = False
......
@@ -22,6 +22,8 @@ import paddle
import unittest
import os
paddle.enable_static()
class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
@classmethod
@@ -37,6 +39,8 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
fuse_all_optimizer_ops=False):
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return
if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
return
feed_dict_data = None
if init_feed_dict is not None:
@@ -83,11 +87,15 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
def test_simple_fc_with_fuse_all_reduce(self):
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.XPU)
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)
def test_batchnorm_fc_with_fuse_all_reduce(self):
self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
DeviceType.CUDA)
# TODO(wangxi): xpu batch_norm op only support dim = 4
# self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
# DeviceType.XPU)
self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
DeviceType.CPU)
@@ -127,6 +135,8 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
def test_simple_bow_net_with_fuse_all_reduce(self):
model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
# TODO(wangxi): xpu sum op only support LodTensor for now
# self._decorate_compare_fused_all_reduce(model, DeviceType.XPU)
self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
......