From b8bce682e00fa122111bf1d2fdd55584e41d82b9 Mon Sep 17 00:00:00 2001
From: WangXi
Date: Fri, 26 Feb 2021 15:33:32 +0800
Subject: [PATCH] xpu support fuse allreduce (#31104)

---
 .../framework/details/fused_all_reduce_op_handle.cc     | 10 +++-------
 paddle/fluid/operators/coalesce_tensor_op.cc             | 10 ++++++++++
 paddle/fluid/platform/device_memory_aligment.cc          |  3 +++
 .../tests/unittests/parallel_executor_test_base.py       |  1 -
 .../fluid/tests/unittests/test_fuse_all_reduce_pass.py   | 10 ++++++++++
 5 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index f792f7f8963..8f45c364476 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -76,14 +76,10 @@ void FusedAllReduceOpHandle::RunImpl() {
           "handles is %d, and the number of output variable handles is %d.",
           in_var_handles.size(), out_var_handles.size()));
 
-// Note: some gradient op doesn't have CUDAKernel, so the gradients of
-// those op are in CPUPlace, in this case, the all reduce should not be fused.
-#if defined(PADDLE_WITH_XPU_BKCL)
-  // TODO(liuyuhui): XPU don't support fuse all reduce for now
-  if (InputIsInDifferentPlace(in_var_handles) || true) {
-#else
+  // Note: some gradient op doesn't have CUDAKernel or XPUKernel, so the
+  // gradients of those op are in CPUPlace, in this case, the all reduce
+  // should not be fused.
   if (InputIsInDifferentPlace(in_var_handles)) {
-#endif
     for (size_t j = 0; j < num_of_all_reduce_; ++j) {
       std::vector dev_inputs;
       std::vector dev_outputs;
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
index 464d8c8d56f..ad255b18826 100644
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -299,6 +299,16 @@ REGISTER_OP_CUDA_KERNEL(
     ops::CoalesceTensorOpKernel);
 #endif
 
+#ifdef PADDLE_WITH_XPU
+REGISTER_OP_XPU_KERNEL(
+    coalesce_tensor,
+    ops::CoalesceTensorOpKernel,
+    ops::CoalesceTensorOpKernel,
+    ops::CoalesceTensorOpKernel,
+    ops::CoalesceTensorOpKernel);
+#endif
+
 REGISTER_OP_VERSION(coalesce_tensor)
     .AddCheckpoint(
         R"ROC(
diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc
index b287d11a9fe..f8e03110441 100644
--- a/paddle/fluid/platform/device_memory_aligment.cc
+++ b/paddle/fluid/platform/device_memory_aligment.cc
@@ -23,6 +23,9 @@ size_t Alignment(size_t size, const platform::Place &place) {
   } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     alignment = GpuMinChunkSize();
+#elif defined(PADDLE_WITH_XPU)
+    // TODO(wangxi): add XpuMinChunkSize
+    alignment = alignment;
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "Fluid is not compiled with CUDA."));
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 47f5c5085a0..2a8f72c2170 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -192,7 +192,6 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.fuse_elewise_add_act_ops = False
         build_strategy.fuse_relu_depthwise_conv = False
         build_strategy.fuse_all_optimizer_ops = False
-        build_strategy.fuse_all_reduce_ops = False
         build_strategy.memory_optimize = False
         build_strategy.enable_inplace = False
         build_strategy.enable_sequential_execution = False
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index 881b9d90579..e3a25661337 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -22,6 +22,8 @@
 import paddle
 import unittest
 import os
 
+paddle.enable_static()
+
 
 class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
     @classmethod
@@ -37,6 +39,8 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
                                           fuse_all_optimizer_ops=False):
         if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
+        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
+            return
 
         feed_dict_data = None
         if init_feed_dict is not None:
@@ -83,11 +87,15 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
     def test_simple_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)
 
     def test_batchnorm_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CUDA)
+        # TODO(wangxi): xpu batch_norm op only support dim = 4
+        # self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
+        #                                         DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CPU)
 
@@ -127,6 +135,8 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
     def test_simple_bow_net_with_fuse_all_reduce(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
+        # TODO(wangxi): xpu sum op only support LodTensor for now
+        # self._decorate_compare_fused_all_reduce(model, DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
-- 
GitLab
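
The patch above makes the existing fuse_all_reduce_ops build-strategy pass usable on XPU (by registering an XPU coalesce_tensor kernel and removing the XPU short-circuit in FusedAllReduceOpHandle). For orientation, here is a minimal sketch of how that option is driven from user code in the static-graph fluid API of this release. The toy network, feed data, and place selection are assumptions made for illustration, not part of the patch; with a single place the fused all-reduce is effectively a no-op, the fusion only matters when gradients are reduced across multiple devices or trainers.

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()

# Build a tiny FC network in a static program (stand-in for simple_fc_net).
main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name='x', shape=[-1, 16], dtype='float32')
    y = fluid.data(name='y', shape=[-1, 1], dtype='int64')
    hidden = fluid.layers.fc(x, size=32, act='relu')
    pred = fluid.layers.fc(hidden, size=2, act='softmax')
    loss = fluid.layers.mean(fluid.layers.cross_entropy(pred, y))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

# Pick XPU when available, mirroring the is_compiled_with_xpu() guard in the test.
place = fluid.XPUPlace(0) if fluid.core.is_compiled_with_xpu() else fluid.CPUPlace()

# The option exercised by test_fuse_all_reduce_pass.py: coalesce gradient
# tensors so their all-reduces are fused into fewer, larger communications.
build_strategy = fluid.BuildStrategy()
build_strategy.fuse_all_reduce_ops = True

exe = fluid.Executor(place)
exe.run(startup_prog)
compiled = fluid.CompiledProgram(main_prog).with_data_parallel(
    loss_name=loss.name, build_strategy=build_strategy, places=[place])

feed = {'x': np.random.random((4, 16)).astype('float32'),
        'y': np.random.randint(0, 2, (4, 1)).astype('int64')}
print(exe.run(compiled, feed=feed, fetch_list=[loss.name]))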