diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index f792f7f8963e068068a82362dcad1dde4d5a25dc..8f45c364476a7539553f1ccc84e8c8c7650567cb 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -76,14 +76,10 @@ void FusedAllReduceOpHandle::RunImpl() {
           "handles is %d, and the number of output variable handles is %d.",
           in_var_handles.size(), out_var_handles.size()));

-// Note: some gradient op doesn't have CUDAKernel, so the gradients of
-// those op are in CPUPlace, in this case, the all reduce should not be fused.
-#if defined(PADDLE_WITH_XPU_BKCL)
-  // TODO(liuyuhui): XPU don't support fuse all reduce for now
-  if (InputIsInDifferentPlace(in_var_handles) || true) {
-#else
+  // Note: some gradient op doesn't have CUDAKernel or XPUKernel, so the
+  // gradients of those op are in CPUPlace, in this case, the all reduce
+  // should not be fused.
   if (InputIsInDifferentPlace(in_var_handles)) {
-#endif
     for (size_t j = 0; j < num_of_all_reduce_; ++j) {
       std::vector<VarHandle *> dev_inputs;
       std::vector<VarHandle *> dev_outputs;
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
index 464d8c8d56f5c425706c01ea01d14f7ac2aed0ab..ad255b188265dea9869c1f2d397b407003e61877 100644
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -299,6 +299,16 @@ REGISTER_OP_CUDA_KERNEL(
     ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
 #endif

+#ifdef PADDLE_WITH_XPU
+REGISTER_OP_XPU_KERNEL(
+    coalesce_tensor,
+    ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext,
+                                plat::float16>,
+    ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, int>,
+    ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, double>);
+#endif
+
 REGISTER_OP_VERSION(coalesce_tensor)
     .AddCheckpoint(
         R"ROC(
diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc
index b287d11a9fe6291c9d4a7a84ab4cbab33859347f..f8e031104415e848101d97d2f66217847630c923 100644
--- a/paddle/fluid/platform/device_memory_aligment.cc
+++ b/paddle/fluid/platform/device_memory_aligment.cc
@@ -23,6 +23,9 @@ size_t Alignment(size_t size, const platform::Place &place) {
   } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     alignment = GpuMinChunkSize();
+#elif defined(PADDLE_WITH_XPU)
+    // TODO(wangxi): add XpuMinChunkSize
+    alignment = alignment;
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "Fluid is not compiled with CUDA."));
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 47f5c5085a027a6f0831cc1de51223e821059257..2a8f72c217055b0166414ab2672b602e08907612 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -192,7 +192,6 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.fuse_elewise_add_act_ops = False
         build_strategy.fuse_relu_depthwise_conv = False
         build_strategy.fuse_all_optimizer_ops = False
-        build_strategy.fuse_all_reduce_ops = False
         build_strategy.memory_optimize = False
         build_strategy.enable_inplace = False
         build_strategy.enable_sequential_execution = False
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index 881b9d905799f241931a20227b998ca10b8b35c0..e3a256613374213bdb80b055171757944b4c0c1a 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -22,6 +22,8 @@ import paddle
 import unittest
 import os

+paddle.enable_static()
+

 class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
     @classmethod
@@ -37,6 +39,8 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
                                           fuse_all_optimizer_ops=False):
         if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
+        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
+            return

         feed_dict_data = None
         if init_feed_dict is not None:
@@ -83,11 +87,15 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):

     def test_simple_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)

     def test_batchnorm_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CUDA)
+        # TODO(wangxi): xpu batch_norm op only support dim = 4
+        # self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
+        #                                         DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CPU)
@@ -127,6 +135,8 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
     def test_simple_bow_net_with_fuse_all_reduce(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
+        # TODO(wangxi): xpu sum op only support LodTensor for now
+        # self._decorate_compare_fused_all_reduce(model, DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
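
Usage sketch (editor's note, not part of the patch): the pass this PR extends is driven entirely by BuildStrategy.fuse_all_reduce_ops, the same flag the updated tests toggle. Below is a minimal static-graph example of switching it on; the toy network, tensor names, and CPU_NUM value are illustrative assumptions, not code from this PR.

# Minimal sketch (assumptions noted above): enable the fuse_all_reduce pass
# via BuildStrategy in static-graph mode. With this patch the same switch
# covers XPU as well as CUDA and CPU, instead of being forced off on XPU.
import os
import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
os.environ['CPU_NUM'] = '2'  # illustrative: two CPU places for data parallelism

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    # Hypothetical toy model, standing in for simple_fc_net in the tests.
    x = fluid.data(name='x', shape=[-1, 8], dtype='float32')
    y = fluid.data(name='y', shape=[-1, 1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(pred, y))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

build_strategy = fluid.BuildStrategy()
# Coalesce gradient chunks (the coalesce_tensor op registered above for XPU)
# and all-reduce them in one fused operation per group.
build_strategy.fuse_all_reduce_ops = True

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)

compiled = fluid.CompiledProgram(main_prog).with_data_parallel(
    loss_name=loss.name, build_strategy=build_strategy)

x_np = np.random.random((4, 8)).astype('float32')
y_np = np.random.random((4, 1)).astype('float32')
loss_val, = exe.run(compiled, feed={'x': x_np, 'y': y_np}, fetch_list=[loss])

The fusion only kicks in when all gradients live on the same place; as the first hunk notes, gradients produced by ops without a CUDA or XPU kernel stay in CPUPlace, in which case the handle falls back to one all-reduce per gradient.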