diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc index c5b5398787b44e658b0f8390162df0e6c3006651..f0e5f6580fbc9e70562cb2fdd7e0c5d8729bc9a7 100644 --- a/paddle/fluid/operators/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_sum_op.cc @@ -23,12 +23,13 @@ REGISTER_OP_CPU_KERNEL( ops::ReduceKernel, ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL(reduce_sum_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +REGISTER_OP_CPU_KERNEL( + reduce_sum_grad, + ops::ReduceSumGradKernel, + ops::ReduceSumGradKernel, + ops::ReduceSumGradKernel, + ops::ReduceSumGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h index e67d7e1da5f0244d2dee346873692a80cbad2fc4..3e8d1bbdba504669bc06e0637094e3bee840adf2 100644 --- a/paddle/fluid/operators/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_sum_op.h @@ -14,11 +14,69 @@ #pragma once +#include + #include "paddle/fluid/operators/reduce_op.h" namespace paddle { namespace operators { +// use for loop to speed up Eigen broadcast. 4 timer faster then broadcast +template +class ReduceSumGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto dims = context.Attr>("dim"); + if (context.GetPlace().type() == typeid(platform::CPUPlace) && + dims.size() == 1) { + auto* input0 = context.Input("X"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + const auto* input2_d = input2->data(); + auto* output_d = output->data(); + + // handle reduce_all + if (input2->dims().size() == 1 && input2->dims()[0] == 1) { + for (int64_t i = 0; i < framework::product(input0->dims()); ++i) { + output_d[i] = input2_d[0]; + } + return; + } + + // handle reduce by one dimension + int reduce_dim_index = dims[0]; + if (reduce_dim_index < 0) { + reduce_dim_index += input0->dims().size(); + } + + auto& input_dim = input0->dims(); + int64_t before_dim = 1; + for (int i = 0; i < reduce_dim_index; ++i) { + before_dim *= input_dim[i]; + } + int64_t reduce_dim = input_dim[reduce_dim_index]; + int64_t after_dim = 1; + for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) { + after_dim *= input_dim[i]; + } + for (int64_t i = 0; i < before_dim; ++i) { + for (int64_t j = 0; j < reduce_dim; ++j) { + for (int64_t k = 0; k < after_dim; ++k) { + output_d[i * reduce_dim * after_dim + j * after_dim + k] = + input2_d[i * after_dim + k]; + } + } + } + return; + } + + // default use Eigen broadcast + ReduceGradKernel kernel; + kernel.Compute(context); + } +}; + struct SumFunctor { template void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { @@ -31,7 +89,7 @@ struct SumGradFunctor { typename DY, typename Dim> void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy, const Dim& dim, int size) { - dx->device(place) = dy->broadcast(dim); + dx->device(place) = dy->eval().broadcast(dim); } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ab40d0c217f565493b30d9a4cb3a600863122bc7..4df806216aa3ecf03e81a491a1ffbb4cc47d7b50 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2961,7 +2961,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. fluid.layers.reduce_sum(x) # [3.5] fluid.layers.reduce_sum(x, dim=0) # [0.3, 0.5, 1.1, 1.6] fluid.layers.reduce_sum(x, dim=-1) # [1.9, 1.6] @@ -2970,7 +2970,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): # x is a Tensor variable with shape [2, 2, 2] and elements as below: # [[[1, 2], [3, 4]], # [[5, 6], [7, 8]]] - # Each example is followed by the correspending output tensor. + # Each example is followed by the corresponding output tensor. fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26] fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20] diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 865c2b7df085aa6a6cb0d6eb461c342ce08695cd..06d116601bf733986ccf9c725340456ab1258be2 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -89,15 +89,11 @@ class TestProdOp(OpTest): self.check_grad(['X'], 'Out') -class TestKeepDimReduce(OpTest): +class Test1DReduce(OpTest): def setUp(self): self.op_type = "reduce_sum" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} - self.attrs = {'dim': [-2], 'keep_dim': True} - self.outputs = { - 'Out': - self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True) - } + self.inputs = {'X': np.random.random(20).astype("float64")} + self.outputs = {'Out': self.inputs['X'].sum(axis=0)} def test_check_output(self): self.check_output() @@ -106,32 +102,82 @@ class TestKeepDimReduce(OpTest): self.check_grad(['X'], 'Out') -class Test1DReduce(OpTest): +class Test2DReduce0(Test1DReduce): def setUp(self): self.op_type = "reduce_sum" - self.inputs = {'X': np.random.random(20).astype("float64")} + self.attrs = {'dim': [0]} + self.inputs = {'X': np.random.random((20, 10)).astype("float64")} self.outputs = {'Out': self.inputs['X'].sum(axis=0)} - def test_check_output(self): - self.check_output() - def test_check_grad(self): - self.check_grad(['X'], 'Out') +class Test2DReduce1(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((20, 10)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } -class TestReduceAll(OpTest): +class Test3DReduce0(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce1(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce2(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [-2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce3(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [1, 2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class TestKeepDimReduce(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} + self.attrs = {'dim': [1], 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']), + keepdims=self.attrs['keep_dim']) + } + + +class TestReduceAll(Test1DReduce): def setUp(self): self.op_type = "reduce_sum" self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")} self.attrs = {'reduce_all': True} self.outputs = {'Out': self.inputs['X'].sum()} - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - ## reduction in multi dims class TestReduceMeanOpMultiAxises(OpTest):