diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index f792f7f8963e068068a82362dcad1dde4d5a25dc..8f45c364476a7539553f1ccc84e8c8c7650567cb 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -76,14 +76,10 @@ void FusedAllReduceOpHandle::RunImpl() {
           "handles is %d, and the number of output variable handles is %d.",
           in_var_handles.size(), out_var_handles.size()));

-// Note: some gradient op doesn't have CUDAKernel, so the gradients of
-// those op are in CPUPlace, in this case, the all reduce should not be fused.
-#if defined(PADDLE_WITH_XPU_BKCL)
-  // TODO(liuyuhui): XPU don't support fuse all reduce for now
-  if (InputIsInDifferentPlace(in_var_handles) || true) {
-#else
+  // Note: some gradient op doesn't have CUDAKernel or XPUKernel, so the
+  // gradients of those op are in CPUPlace, in this case, the all reduce
+  // should not be fused.
   if (InputIsInDifferentPlace(in_var_handles)) {
-#endif
     for (size_t j = 0; j < num_of_all_reduce_; ++j) {
       std::vector<VarHandle *> dev_inputs;
       std::vector<VarHandle *> dev_outputs;
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
index 464d8c8d56f5c425706c01ea01d14f7ac2aed0ab..ad255b188265dea9869c1f2d397b407003e61877 100644
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -299,6 +299,16 @@ REGISTER_OP_CUDA_KERNEL(
     ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
 #endif

+#ifdef PADDLE_WITH_XPU
+REGISTER_OP_XPU_KERNEL(
+    coalesce_tensor,
+    ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext,
+                                plat::float16>,
+    ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, int>,
+    ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, float>,
+    ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, double>);
+#endif
+
 REGISTER_OP_VERSION(coalesce_tensor)
     .AddCheckpoint(
         R"ROC(
diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc
index b287d11a9fe6291c9d4a7a84ab4cbab33859347f..f8e031104415e848101d97d2f66217847630c923 100644
--- a/paddle/fluid/platform/device_memory_aligment.cc
+++ b/paddle/fluid/platform/device_memory_aligment.cc
@@ -23,6 +23,9 @@ size_t Alignment(size_t size, const platform::Place &place) {
   } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     alignment = GpuMinChunkSize();
+#elif defined(PADDLE_WITH_XPU)
+    // TODO(wangxi): add XpuMinChunkSize
+    alignment = alignment;
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "Fluid is not compiled with CUDA."));
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 47f5c5085a027a6f0831cc1de51223e821059257..2a8f72c217055b0166414ab2672b602e08907612 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -192,7 +192,6 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.fuse_elewise_add_act_ops = False
         build_strategy.fuse_relu_depthwise_conv = False
         build_strategy.fuse_all_optimizer_ops = False
-        build_strategy.fuse_all_reduce_ops = False
         build_strategy.memory_optimize = False
         build_strategy.enable_inplace = False
         build_strategy.enable_sequential_execution = False
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index 881b9d905799f241931a20227b998ca10b8b35c0..e3a256613374213bdb80b055171757944b4c0c1a 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -22,6 +22,8 @@ import paddle
 import unittest
 import os

+paddle.enable_static()
+

 class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
     @classmethod
@@ -37,6 +39,8 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
                                           fuse_all_optimizer_ops=False):
         if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
+        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
+            return

         feed_dict_data = None
         if init_feed_dict is not None:
@@ -83,11 +87,15 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):

     def test_simple_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)

     def test_batchnorm_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CUDA)
+        # TODO(wangxi): xpu batch_norm op only support dim = 4
+        # self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
+        #                                         DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CPU)
@@ -127,6 +135,8 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
     def test_simple_bow_net_with_fuse_all_reduce(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
+        # TODO(wangxi): xpu sum op only support LodTensor for now
+        # self._decorate_compare_fused_all_reduce(model, DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
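
Usage sketch (editor's note, not part of the patch): the pass this PR extends is driven entirely by BuildStrategy.fuse_all_reduce_ops, the same flag the updated tests toggle. Below is a minimal static-graph example of switching it on; the toy network, tensor names, and CPU_NUM value are illustrative assumptions, not code from this PR.

# Minimal sketch (assumptions noted above): enable the fuse_all_reduce pass
# via BuildStrategy in static-graph mode. With this patch the same switch
# covers XPU as well as CUDA and CPU, instead of being forced off on XPU.
import os
import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
os.environ['CPU_NUM'] = '2'  # illustrative: two CPU places for data parallelism

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    # Hypothetical toy model, standing in for simple_fc_net in the tests.
    x = fluid.data(name='x', shape=[-1, 8], dtype='float32')
    y = fluid.data(name='y', shape=[-1, 1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(pred, y))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

build_strategy = fluid.BuildStrategy()
# Coalesce gradient chunks (the coalesce_tensor op registered above for XPU)
# and all-reduce them in one fused operation per group.
build_strategy.fuse_all_reduce_ops = True

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)

compiled = fluid.CompiledProgram(main_prog).with_data_parallel(
    loss_name=loss.name, build_strategy=build_strategy)

x_np = np.random.random((4, 8)).astype('float32')
y_np = np.random.random((4, 1)).astype('float32')
loss_val, = exe.run(compiled, feed={'x': x_np, 'y': y_np}, fetch_list=[loss])

The fusion only kicks in when all gradients live on the same place; as the first hunk notes, gradients produced by ops without a CUDA or XPU kernel stay in CPUPlace, in which case the handle falls back to one all-reduce per gradient.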