From 5d718a5886478a4d1349d9e85ad217c4d4970b5e Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Sun, 22 Jul 2018 15:19:27 +0800
Subject: [PATCH] optimize reduce_sum_grad op

---
 paddle/fluid/operators/reduce_op.h            | 29 ++++++++++++
 paddle/fluid/operators/reduce_sum_op.h        |  2 +-
 python/paddle/fluid/layers/nn.py              |  4 +-
 .../fluid/tests/unittests/test_reduce_op.py   | 44 ++++++++++---------
 4 files changed, 56 insertions(+), 23 deletions(-)
diff --git a/paddle/fluid/operators/reduce_op.h b/paddle/fluid/operators/reduce_op.h
index 72b6cf177..735ad3af2 100644
--- a/paddle/fluid/operators/reduce_op.h
+++ b/paddle/fluid/operators/reduce_op.h
@@ -88,6 +88,35 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     auto* output = context.Output<Tensor>(framework::GradVarName("X"));
     output->mutable_data<T>(context.GetPlace());
 
+    // CPU fast path. NOTE(review): copies input2 unscaled; sum-only? Confirm.
+    if (context.GetPlace().type() == typeid(platform::CPUPlace)) {
+      const auto* input2_d = input2->data<T>();
+      auto* output_d = output->data<T>();
+
+      // reduce_all: input2 is a single value; copy it to every output element.
+      if (reduce_all) {
+        PADDLE_ENFORCE(input2->dims().size() == 1 && input2->dims()[0] == 1,
+                       "output should be a scalar");
+        const int64_t numel = framework::product(input0->dims());
+        for (int64_t i = 0; i < numel; ++i) {
+          output_d[i] = input2_d[0];
+        }
+        return;
+      }
+      if (input0->dims().size() == 2 && dims.size() == 1) {
+        auto& input_dim = input0->dims();
+        // "dim" may be negative (-2 is axis 0, -1 is axis 1); normalize it.
+        const int axis = dims[0] < 0 ? dims[0] + 2 : dims[0];
+        for (int64_t i = 0; i < input_dim[0]; ++i) {
+          for (int64_t j = 0; j < input_dim[1]; ++j) {
+            // Axis 0 reduced: one grad per column; otherwise one per row.
+            output_d[i * input_dim[1] + j] = input2_d[axis == 0 ? j : i];
+          }
+        }
+        return;
+      }
+    }
+
     if (reduce_all) {
       auto x = EigenVector<T>::Flatten(*input0);
       auto x_reduce = EigenVector<T>::From(*input1);
diff --git a/paddle/fluid/operators/reduce_sum_op.h b/paddle/fluid/operators/reduce_sum_op.h
index e67d7e1da..248782ce9 100644
--- a/paddle/fluid/operators/reduce_sum_op.h
+++ b/paddle/fluid/operators/reduce_sum_op.h
@@ -31,7 +31,7 @@ struct SumGradFunctor {
             typename DY, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    dx->device(place) = dy->broadcast(dim);
+    dx->device(place) = dy->eval().broadcast(dim);
   }
 };
 
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ab40d0c21..4df806216 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -2961,7 +2961,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             # x is a Tensor variable with following elements:
             #    [[0.2, 0.3, 0.5, 0.9]
             #     [0.1, 0.2, 0.6, 0.7]]
-            # Each example is followed by the correspending output tensor.
+            # Each example is followed by the corresponding output tensor.
             fluid.layers.reduce_sum(x)  # [3.5]
             fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
             fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
@@ -2970,7 +2970,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
             # x is a Tensor variable with shape [2, 2, 2] and elements as below:
             #      [[[1, 2], [3, 4]],
             #      [[5, 6], [7, 8]]]
-            # Each example is followed by the correspending output tensor.
+            # Each example is followed by the corresponding output tensor.
             fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26]
             fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20]
 
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 865c2b7df..dbc289264 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -89,15 +89,11 @@ class TestProdOp(OpTest):
         self.check_grad(['X'], 'Out')
 
 
-class TestKeepDimReduce(OpTest):
+class Test1DReduce(OpTest):
     def setUp(self):
         self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
-        self.attrs = {'dim': [-2], 'keep_dim': True}
-        self.outputs = {
-            'Out':
-            self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
-        }
+        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
 
     def test_check_output(self):
         self.check_output()
@@ -106,32 +102,40 @@ class TestKeepDimReduce(OpTest):
         self.check_grad(['X'], 'Out')
 
 
-class Test1DReduce(OpTest):
+class Test2DReduce0(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
-        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.attrs = {'dim': [0]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
         self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
 
-    def test_check_output(self):
-        self.check_output()
 
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+class Test2DReduce1(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.attrs = {'dim': [1]}
+        self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=1)}
+
+
+class TestKeepDimReduce(Test1DReduce):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': [-2], 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                        keepdims=self.attrs['keep_dim'])
+        }
 
 
-class TestReduceAll(OpTest):
+class TestReduceAll(Test1DReduce):
     def setUp(self):
         self.op_type = "reduce_sum"
         self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
         self.attrs = {'reduce_all': True}
         self.outputs = {'Out': self.inputs['X'].sum()}
 
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
-
 
 ## reduction in multi dims
 class TestReduceMeanOpMultiAxises(OpTest):
-- 
GitLab