diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h index 735ad3af2afb6d70f05bc6c5f99b6e317c347923..72b6cf1773d5bcc42e40e72111179d454d2bb4a9 100644 --- a/paddle/fluid/operators/reduce_op.h +++ b/paddle/fluid/operators/reduce_op.h @@ -88,35 +88,6 @@ class ReduceGradKernel : public framework::OpKernel { auto* output = context.Output(framework::GradVarName("X")); output->mutable_data(context.GetPlace()); - if (context.GetPlace().type() == typeid(platform::CPUPlace)) { - const auto* input2_d = input2->data(); - auto* output_d = output->data(); - - // CPU reduce_all_grad - if (reduce_all) { - PADDLE_ENFORCE(input2->dims().size() == 1 && input2->dims()[0] == 1, - "output should be a scalar"); - for (int64_t i = 0; i < framework::product(input0->dims()); ++i) { - output_d[i] = input2_d[0]; - } - return; - } - - if (input0->dims().size() == 2 && dims.size() == 1) { - auto& input_dim = input0->dims(); - for (int64_t i = 0; i < input_dim[0]; ++i) { - for (int64_t j = 0; j < input_dim[1]; ++j) { - if (dims[0] == 0) { - output_d[i * input_dim[1] + j] = input2_d[j]; - } else { - output_d[i * input_dim[1] + j] = input2_d[i]; - } - } - } - return; - } - } - if (reduce_all) { auto x = EigenVector::Flatten(*input0); auto x_reduce = EigenVector::From(*input1); diff --git a/paddle/fluid/operators/reduce_sum_op.cc b/paddle/fluid/operators/reduce_sum_op.cc index c5b5398787b44e658b0f8390162df0e6c3006651..f0e5f6580fbc9e70562cb2fdd7e0c5d8729bc9a7 100644 --- a/paddle/fluid/operators/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_sum_op.cc @@ -23,12 +23,13 @@ REGISTER_OP_CPU_KERNEL( ops::ReduceKernel, ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL(reduce_sum_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +REGISTER_OP_CPU_KERNEL( + reduce_sum_grad, + ops::ReduceSumGradKernel, + ops::ReduceSumGradKernel, + ops::ReduceSumGradKernel, + ops::ReduceSumGradKernel); diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h index 248782ce9731657a7796c0bb96e1e34e108a6493..3e8d1bbdba504669bc06e0637094e3bee840adf2 100644 --- a/paddle/fluid/operators/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_sum_op.h @@ -14,11 +14,69 @@ #pragma once +#include + #include "paddle/fluid/operators/reduce_op.h" namespace paddle { namespace operators { +// use for loop to speed up Eigen broadcast. 4 timer faster then broadcast +template +class ReduceSumGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto dims = context.Attr>("dim"); + if (context.GetPlace().type() == typeid(platform::CPUPlace) && + dims.size() == 1) { + auto* input0 = context.Input("X"); + auto* input2 = context.Input(framework::GradVarName("Out")); + auto* output = context.Output(framework::GradVarName("X")); + output->mutable_data(context.GetPlace()); + const auto* input2_d = input2->data(); + auto* output_d = output->data(); + + // handle reduce_all + if (input2->dims().size() == 1 && input2->dims()[0] == 1) { + for (int64_t i = 0; i < framework::product(input0->dims()); ++i) { + output_d[i] = input2_d[0]; + } + return; + } + + // handle reduce by one dimension + int reduce_dim_index = dims[0]; + if (reduce_dim_index < 0) { + reduce_dim_index += input0->dims().size(); + } + + auto& input_dim = input0->dims(); + int64_t before_dim = 1; + for (int i = 0; i < reduce_dim_index; ++i) { + before_dim *= input_dim[i]; + } + int64_t reduce_dim = input_dim[reduce_dim_index]; + int64_t after_dim = 1; + for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) { + after_dim *= input_dim[i]; + } + for (int64_t i = 0; i < before_dim; ++i) { + for (int64_t j = 0; j < reduce_dim; ++j) { + for (int64_t k = 0; k < after_dim; ++k) { + output_d[i * reduce_dim * after_dim + j * after_dim + k] = + input2_d[i * after_dim + k]; + } + } + } + return; + } + + // default use Eigen broadcast + ReduceGradKernel kernel; + kernel.Compute(context); + } +}; + struct SumFunctor { template void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index dbc28926460eb243cb108911c13546ae96917935..06d116601bf733986ccf9c725340456ab1258be2 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -115,14 +115,56 @@ class Test2DReduce1(Test1DReduce): self.op_type = "reduce_sum" self.attrs = {'dim': [1]} self.inputs = {'X': np.random.random((20, 10)).astype("float64")} - self.outputs = {'Out': self.inputs['X'].sum(axis=1)} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce0(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [1]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce1(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce2(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [-2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + +class Test3DReduce3(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': [1, 2]} + self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } class TestKeepDimReduce(Test1DReduce): def setUp(self): self.op_type = "reduce_sum" self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} - self.attrs = {'dim': [-2], 'keep_dim': True} + self.attrs = {'dim': [1], 'keep_dim': True} self.outputs = { 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim'])