Unverified commit b8bce682, authored by WangXi, committed by GitHub

xpu support fuse allreduce (#31104)

Parent 59b00e8c
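
From the Python side, the fused allreduce pass that this commit extends to XPU is driven by the fuse_all_reduce_ops build flag. A minimal sketch of turning it on, assuming a static-graph training setup (main_prog, loss, and places are hypothetical placeholders, not part of this commit):

import paddle
import paddle.static as static

paddle.enable_static()

# Fuse per-parameter gradient allreduce ops into grouped allreduce calls;
# with this change the flag also takes effect on XPU, not only CUDA/CPU.
build_strategy = static.BuildStrategy()
build_strategy.fuse_all_reduce_ops = True

# Hypothetical usage: compile an existing program with the strategy.
# compiled_prog = static.CompiledProgram(main_prog).with_data_parallel(
#     loss_name=loss.name, build_strategy=build_strategy, places=places)
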
@@ -76,14 +76,10 @@ void FusedAllReduceOpHandle::RunImpl() {
"handles is %d, and the number of output variable handles is %d.",
in_var_handles.size(), out_var_handles.size()));
// Note: some gradient op doesn't have CUDAKernel, so the gradients of
// those op are in CPUPlace, in this case, the all reduce should not be fused.
#if defined(PADDLE_WITH_XPU_BKCL)
// TODO(liuyuhui): XPU don't support fuse all reduce for now
if (InputIsInDifferentPlace(in_var_handles) || true) {
#else
// Note: some gradient op doesn't have CUDAKernel or XPUKernel, so the
// gradients of those op are in CPUPlace, in this case, the all reduce
// should not be fused.
if (InputIsInDifferentPlace(in_var_handles)) {
#endif
for (size_t j = 0; j < num_of_all_reduce_; ++j) {
std::vector<VarHandle *> dev_inputs;
std::vector<VarHandle *> dev_outputs;
......
@@ -299,6 +299,16 @@ REGISTER_OP_CUDA_KERNEL(
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
#endif
#ifdef PADDLE_WITH_XPU
REGISTER_OP_XPU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, double>);
#endif
REGISTER_OP_VERSION(coalesce_tensor)
.AddCheckpoint(
R"ROC(
......
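
The newly registered coalesce_tensor XPU kernels pack many gradient tensors into one contiguous buffer so that a single collective call can reduce them all. A rough illustration of that coalescing idea in NumPy (an illustration only, not how the op itself is implemented):

import numpy as np

# Illustrative only: copy several gradients into one flat buffer,
# reduce that buffer with a single collective, then view the pieces back.
grads = [np.ones((2, 3), dtype=np.float32), np.ones((4,), dtype=np.float32)]
fused = np.concatenate([g.ravel() for g in grads])   # one contiguous buffer
# ... allreduce(fused) would happen here ...
offsets = np.cumsum([0] + [g.size for g in grads])
views = [fused[offsets[i]:offsets[i + 1]].reshape(g.shape)
         for i, g in enumerate(grads)]                # per-gradient views
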
@@ -23,6 +23,9 @@ size_t Alignment(size_t size, const platform::Place &place) {
} else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
alignment = GpuMinChunkSize();
#elif defined(PADDLE_WITH_XPU)
// TODO(wangxi): add XpuMinChunkSize
alignment = alignment;
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA."));
......
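
The alignment chosen above is used when sizing the pieces of the fused buffer. A small sketch of the remainder-based rounding such a helper typically performs (an illustration, not the Paddle function itself):

def align_up(size, alignment):
    # Round size up to the nearest multiple of alignment.
    remaining = size % alignment
    return size if remaining == 0 else size + (alignment - remaining)

print(align_up(1000, 256))  # 1024
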
@@ -192,7 +192,6 @@ class TestParallelExecutorBase(unittest.TestCase):
build_strategy.fuse_elewise_add_act_ops = False
build_strategy.fuse_relu_depthwise_conv = False
build_strategy.fuse_all_optimizer_ops = False
build_strategy.fuse_all_reduce_ops = False
build_strategy.memory_optimize = False
build_strategy.enable_inplace = False
build_strategy.enable_sequential_execution = False
......
@@ -22,6 +22,8 @@ import paddle
import unittest
import os
paddle.enable_static()
class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
@classmethod
@@ -37,6 +39,8 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
fuse_all_optimizer_ops=False):
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return
if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
return
feed_dict_data = None
if init_feed_dict is not None:
@@ -83,11 +87,15 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
def test_simple_fc_with_fuse_all_reduce(self):
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.XPU)
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)
def test_batchnorm_fc_with_fuse_all_reduce(self):
self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
DeviceType.CUDA)
# TODO(wangxi): xpu batch_norm op only support dim = 4
# self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
# DeviceType.XPU)
self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
DeviceType.CPU)
@@ -127,6 +135,8 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
def test_simple_bow_net_with_fuse_all_reduce(self):
model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
# TODO(wangxi): xpu sum op only support LodTensor for now
# self._decorate_compare_fused_all_reduce(model, DeviceType.XPU)
self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
......