From b8bce682e00fa122111bf1d2fdd55584e41d82b9 Mon Sep 17 00:00:00 2001
From: WangXi
Date: Fri, 26 Feb 2021 15:33:32 +0800
Subject: [PATCH] xpu support fuse allreduce (#31104)

---
 .../framework/details/fused_all_reduce_op_handle.cc     | 10 +++-------
 paddle/fluid/operators/coalesce_tensor_op.cc             | 10 ++++++++++
 paddle/fluid/platform/device_memory_aligment.cc          |  3 +++
 .../tests/unittests/parallel_executor_test_base.py       |  1 -
 .../fluid/tests/unittests/test_fuse_all_reduce_pass.py   | 10 ++++++++++
 5 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
index f792f7f8963..8f45c364476 100644
--- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -76,14 +76,10 @@ void FusedAllReduceOpHandle::RunImpl() {
           "handles is %d, and the number of output variable handles is %d.",
           in_var_handles.size(), out_var_handles.size()));
 
-// Note: some gradient op doesn't have CUDAKernel, so the gradients of
-// those op are in CPUPlace, in this case, the all reduce should not be fused.
-#if defined(PADDLE_WITH_XPU_BKCL)
-  // TODO(liuyuhui): XPU don't support fuse all reduce for now
-  if (InputIsInDifferentPlace(in_var_handles) || true) {
-#else
+  // Note: some gradient op doesn't have CUDAKernel or XPUKernel, so the
+  // gradients of those op are in CPUPlace, in this case, the all reduce
+  // should not be fused.
   if (InputIsInDifferentPlace(in_var_handles)) {
-#endif
     for (size_t j = 0; j < num_of_all_reduce_; ++j) {
       std::vector dev_inputs;
       std::vector dev_outputs;
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
index 464d8c8d56f..ad255b18826 100644
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -299,6 +299,16 @@ REGISTER_OP_CUDA_KERNEL(
     ops::CoalesceTensorOpKernel);
 #endif
 
+#ifdef PADDLE_WITH_XPU
+REGISTER_OP_XPU_KERNEL(
+    coalesce_tensor,
+    ops::CoalesceTensorOpKernel,
+    ops::CoalesceTensorOpKernel,
+    ops::CoalesceTensorOpKernel,
+    ops::CoalesceTensorOpKernel);
+#endif
+
 REGISTER_OP_VERSION(coalesce_tensor)
     .AddCheckpoint(
         R"ROC(
diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc
index b287d11a9fe..f8e03110441 100644
--- a/paddle/fluid/platform/device_memory_aligment.cc
+++ b/paddle/fluid/platform/device_memory_aligment.cc
@@ -23,6 +23,9 @@ size_t Alignment(size_t size, const platform::Place &place) {
   } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     alignment = GpuMinChunkSize();
+#elif defined(PADDLE_WITH_XPU)
+    // TODO(wangxi): add XpuMinChunkSize
+    alignment = alignment;
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "Fluid is not compiled with CUDA."));
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 47f5c5085a0..2a8f72c2170 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -192,7 +192,6 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.fuse_elewise_add_act_ops = False
         build_strategy.fuse_relu_depthwise_conv = False
         build_strategy.fuse_all_optimizer_ops = False
-        build_strategy.fuse_all_reduce_ops = False
         build_strategy.memory_optimize = False
         build_strategy.enable_inplace = False
         build_strategy.enable_sequential_execution = False
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index 881b9d90579..e3a25661337 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -22,6 +22,8 @@
 import paddle
 import unittest
 import os
 
+paddle.enable_static()
+
 
 class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
     @classmethod
@@ -37,6 +39,8 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
                                           fuse_all_optimizer_ops=False):
         if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
             return
+        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
+            return
 
         feed_dict_data = None
         if init_feed_dict is not None:
@@ -83,11 +87,15 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
     def test_simple_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)
 
     def test_batchnorm_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CUDA)
+        # TODO(wangxi): xpu batch_norm op only support dim = 4
+        # self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
+        #                                         DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
                                                 DeviceType.CPU)
 
@@ -127,6 +135,8 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
     def test_simple_bow_net_with_fuse_all_reduce(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
+        # TODO(wangxi): xpu sum op only support LodTensor for now
+        # self._decorate_compare_fused_all_reduce(model, DeviceType.XPU)
         self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
-- 
GitLab
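
The patch above makes the existing fuse_all_reduce_ops build-strategy pass usable on XPU (by registering an XPU coalesce_tensor kernel and removing the XPU short-circuit in FusedAllReduceOpHandle). For orientation, here is a minimal sketch of how that option is driven from user code in the static-graph fluid API of this release. The toy network, feed data, and place selection are assumptions made for illustration, not part of the patch; with a single place the fused all-reduce is effectively a no-op, the fusion only matters when gradients are reduced across multiple devices or trainers.

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()

# Build a tiny FC network in a static program (stand-in for simple_fc_net).
main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name='x', shape=[-1, 16], dtype='float32')
    y = fluid.data(name='y', shape=[-1, 1], dtype='int64')
    hidden = fluid.layers.fc(x, size=32, act='relu')
    pred = fluid.layers.fc(hidden, size=2, act='softmax')
    loss = fluid.layers.mean(fluid.layers.cross_entropy(pred, y))
    fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

# Pick XPU when available, mirroring the is_compiled_with_xpu() guard in the test.
place = fluid.XPUPlace(0) if fluid.core.is_compiled_with_xpu() else fluid.CPUPlace()

# The option exercised by test_fuse_all_reduce_pass.py: coalesce gradient
# tensors so their all-reduces are fused into fewer, larger communications.
build_strategy = fluid.BuildStrategy()
build_strategy.fuse_all_reduce_ops = True

exe = fluid.Executor(place)
exe.run(startup_prog)
compiled = fluid.CompiledProgram(main_prog).with_data_parallel(
    loss_name=loss.name, build_strategy=build_strategy, places=[place])

feed = {'x': np.random.random((4, 16)).astype('float32'),
        'y': np.random.randint(0, 2, (4, 1)).astype('int64')}
print(exe.run(compiled, feed=feed, fetch_list=[loss.name]))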