未验证 提交 b8bce682 编写于 作者: W WangXi 提交者: GitHub

xpu support fuse allreduce (#31104)

上级 59b00e8c
...@@ -76,14 +76,10 @@ void FusedAllReduceOpHandle::RunImpl() { ...@@ -76,14 +76,10 @@ void FusedAllReduceOpHandle::RunImpl() {
"handles is %d, and the number of output variable handles is %d.", "handles is %d, and the number of output variable handles is %d.",
in_var_handles.size(), out_var_handles.size())); in_var_handles.size(), out_var_handles.size()));
// Note: some gradient op doesn't have CUDAKernel, so the gradients of // Note: some gradient op doesn't have CUDAKernel or XPUKernel, so the
// those op are in CPUPlace, in this case, the all reduce should not be fused. // gradients of those op are in CPUPlace, in this case, the all reduce
#if defined(PADDLE_WITH_XPU_BKCL) // should not be fused.
// TODO(liuyuhui): XPU doesn't support fused all-reduce for now
if (InputIsInDifferentPlace(in_var_handles) || true) {
#else
if (InputIsInDifferentPlace(in_var_handles)) { if (InputIsInDifferentPlace(in_var_handles)) {
#endif
for (size_t j = 0; j < num_of_all_reduce_; ++j) { for (size_t j = 0; j < num_of_all_reduce_; ++j) {
std::vector<VarHandle *> dev_inputs; std::vector<VarHandle *> dev_inputs;
std::vector<VarHandle *> dev_outputs; std::vector<VarHandle *> dev_outputs;
......
...@@ -299,6 +299,16 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -299,6 +299,16 @@ REGISTER_OP_CUDA_KERNEL(
ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>); ops::CoalesceTensorOpKernel<paddle::platform::CUDADeviceContext, double>);
#endif #endif
// Register the coalesce_tensor kernel for XPUDeviceContext when the build
// defines PADDLE_WITH_XPU. One CoalesceTensorOpKernel instantiation is
// registered per element type: float16, int, float and double.
#ifdef PADDLE_WITH_XPU
REGISTER_OP_XPU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, double>);
#endif
REGISTER_OP_VERSION(coalesce_tensor) REGISTER_OP_VERSION(coalesce_tensor)
.AddCheckpoint( .AddCheckpoint(
R"ROC( R"ROC(
......
...@@ -23,6 +23,9 @@ size_t Alignment(size_t size, const platform::Place &place) { ...@@ -23,6 +23,9 @@ size_t Alignment(size_t size, const platform::Place &place) {
} else { } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
alignment = GpuMinChunkSize(); alignment = GpuMinChunkSize();
#elif defined(PADDLE_WITH_XPU)
// TODO(wangxi): add XpuMinChunkSize
alignment = alignment;
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA.")); "Fluid is not compiled with CUDA."));
......
...@@ -192,7 +192,6 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -192,7 +192,6 @@ class TestParallelExecutorBase(unittest.TestCase):
build_strategy.fuse_elewise_add_act_ops = False build_strategy.fuse_elewise_add_act_ops = False
build_strategy.fuse_relu_depthwise_conv = False build_strategy.fuse_relu_depthwise_conv = False
build_strategy.fuse_all_optimizer_ops = False build_strategy.fuse_all_optimizer_ops = False
build_strategy.fuse_all_reduce_ops = False
build_strategy.memory_optimize = False build_strategy.memory_optimize = False
build_strategy.enable_inplace = False build_strategy.enable_inplace = False
build_strategy.enable_sequential_execution = False build_strategy.enable_sequential_execution = False
......
...@@ -22,6 +22,8 @@ import paddle ...@@ -22,6 +22,8 @@ import paddle
import unittest import unittest
import os import os
paddle.enable_static()
class TestFuseAllReduceOpsBase(TestParallelExecutorBase): class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
@classmethod @classmethod
...@@ -37,6 +39,8 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase): ...@@ -37,6 +39,8 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
fuse_all_optimizer_ops=False): fuse_all_optimizer_ops=False):
if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
return return
if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
return
feed_dict_data = None feed_dict_data = None
if init_feed_dict is not None: if init_feed_dict is not None:
...@@ -83,11 +87,15 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase): ...@@ -83,11 +87,15 @@ class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
def test_simple_fc_with_fuse_all_reduce(self): def test_simple_fc_with_fuse_all_reduce(self):
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA) self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.XPU)
self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU) self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)
def test_batchnorm_fc_with_fuse_all_reduce(self): def test_batchnorm_fc_with_fuse_all_reduce(self):
self._decorate_compare_fused_all_reduce(fc_with_batchnorm, self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
DeviceType.CUDA) DeviceType.CUDA)
# TODO(wangxi): the XPU batch_norm op only supports dim = 4
# self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
# DeviceType.XPU)
self._decorate_compare_fused_all_reduce(fc_with_batchnorm, self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
DeviceType.CPU) DeviceType.CPU)
...@@ -127,6 +135,8 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase): ...@@ -127,6 +135,8 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
def test_simple_bow_net_with_fuse_all_reduce(self): def test_simple_bow_net_with_fuse_all_reduce(self):
model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True) model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA) self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA)
# TODO(wangxi): the XPU sum op only supports LoDTensor for now
# self._decorate_compare_fused_all_reduce(model, DeviceType.XPU)
self._decorate_compare_fused_all_reduce(model, DeviceType.CPU) self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册