Commit 12e36d38 authored by Qiyang Min, committed by Jiabin Yang

Imperative depth-first backward process (#16605)

* Fix bug of gradient interface

* shrink transformer

* Right transformer

* Change from breadth-first to depth-first backward process

test=develop

* Reverse-iterate over the op's inputs

test=develop

* Polish code

* Change the iteration direction in ingrads' map slots

test=develop

* Polish code

test=develop
Parent 353244f4
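The main change switches the imperative autograd engine from a breadth-first to a depth-first walk of the backward dependency graph. Below is a minimal Python sketch of the idea (hypothetical op and graph structures, not Paddle's actual C++ implementation; only the dep_counts bookkeeping mirrors the diff that follows): the only difference between the two orders is whether ready ops are popped from the back (stack, depth-first) or the front (queue, breadth-first) of the ready list.

```python
# Minimal sketch (hypothetical op/graph structures, not Paddle's C++ code):
# switching the container of "ready" grad ops from a FIFO queue to a LIFO
# stack turns the breadth-first backward pass into a depth-first one.
from collections import deque

def run_backward(start_op, pre_ops_of, dep_counts, depth_first=True):
    """Apply each grad op once all ops that depend on it have been applied."""
    ready = deque([start_op])
    order = []
    while ready:
        # pop() -> stack -> depth-first; popleft() -> queue -> breadth-first
        op = ready.pop() if depth_first else ready.popleft()
        order.append(op)  # stands in for ready_op->ApplyGrad()
        for pre_op in pre_ops_of.get(op, []):
            dep_counts[pre_op] -= 1  # same bookkeeping as in the diff below
            if dep_counts[pre_op] == 0:
                ready.append(pre_op)
    return order

# Example: op C depends on B, B depends on A, so backward runs C -> B -> A.
print(run_backward("C", {"C": ["B"], "B": ["A"]}, {"A": 1, "B": 1}))
```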
@@ -122,14 +122,14 @@ class Autograd {
       std::map<std::string, std::vector<VarBase*>> input_grads =
           ready_op->ApplyGrad();

-      for (auto it : input_grads) {
-        const std::vector<VarBase*>& ingrads = it.second;
+      for (auto it = input_grads.rbegin(); it != input_grads.rend(); ++it) {
+        const std::vector<VarBase*>& ingrads = it->second;
         for (size_t i = 0; i < ingrads.size(); ++i) {
           if (!ingrads[i]) continue;
-          if (ready_op->input_vars_[it.first][i]->IsStopGradient()) {
+          if (ready_op->input_vars_[it->first][i]->IsStopGradient()) {
             continue;
           }
-          OpBase* pre_op = ready_op->pre_ops_[it.first][i];
+          OpBase* pre_op = ready_op->pre_ops_[it->first][i];
           if (!pre_op) continue;
           dep_counts[pre_op] -= 1;
......
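The loop above now walks input_grads with reverse iterators (rbegin/rend) and accesses map entries through it-> instead of it., so gradient slots are visited back to front. A rough Python analogue of that iteration-order change, with placeholder data standing in for the real VarBase pointers:

```python
# Rough Python analogue of the C++ change: visit the ordered map of input
# gradients in reverse, skipping empty slots. The dict below is placeholder
# data standing in for std::map<std::string, std::vector<VarBase*>>.
input_grads = {"X@GRAD": ["gx0", None], "Y@GRAD": ["gy0"]}

for name, ingrads in reversed(list(input_grads.items())):  # rbegin()..rend()
    for i, grad in enumerate(ingrads):
        if grad is None:        # mirrors `if (!ingrads[i]) continue;`
            continue
        print(name, i, grad)    # stands in for accumulating into pre-op grads
```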
@@ -493,7 +493,8 @@ class Variable(object):
         self._ivar._run_backward()

     def _gradient(self):
-        return np.array(self._ivar._grad_value())
+        new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
+        return np.array(new_ivar.value().get_tensor())

     def _clear_gradient(self):
         self._ivar._clear_gradient()
......
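_gradient() now copies the gradient VarBase to CPU before converting it to numpy, instead of reading _grad_value() directly, so it also works when the variable lives on a GPU place. An illustrative mock of that flow (FakeIVar and FakeTensor are hypothetical stand-ins, not Paddle classes; only the call chain mirrors the diff above):

```python
# Hypothetical mock of the new _gradient() flow: fetch the grad VarBase,
# copy it to CPU, then expose it as a numpy array.
import numpy as np

class FakeTensor:
    def __init__(self, data):
        self._data = data
    def __array__(self):                  # lets np.array() read the payload
        return np.asarray(self._data)

class FakeIVar:
    def __init__(self, data, place="gpu"):
        self._data, self._place = data, place
    def _grad_ivar(self):                 # gradient lives wherever the var does
        return FakeIVar([1.0, 2.0], self._place)
    def _copy_to(self, place, blocking):  # new step: bring the grad to CPU first
        return FakeIVar(self._data, place)
    def value(self):
        return self
    def get_tensor(self):
        return FakeTensor(self._data)

def _gradient(ivar):
    # mirrors: new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
    new_ivar = ivar._grad_ivar()._copy_to("cpu", True)
    return np.array(new_ivar.value().get_tensor())

print(_gradient(FakeIVar([0.0])))         # -> [1. 2.]
```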
@@ -303,7 +303,7 @@ use_py_reader = False
 sync = False

 # how many batches we use
-batch_num = 2
+batch_num = 50

 np.random.seed = 1
 src_word_np = np.random.randint(
@@ -1076,20 +1076,17 @@ class TestDygraphTransformer(unittest.TestCase):
                         4]] = out[k]

         self.assertTrue(
-            np.allclose(static_avg_cost_value, dy_avg_cost._numpy()))
+            np.array_equal(static_avg_cost_value, dy_avg_cost._numpy()))
         self.assertTrue(
-            np.allclose(static_sum_cost_value, dy_sum_cost._numpy()))
+            np.array_equal(static_sum_cost_value, dy_sum_cost._numpy()))
         self.assertTrue(
-            np.allclose(
-                static_predict_value, dy_predict._numpy(), atol=1e-5))
+            np.array_equal(static_predict_value, dy_predict._numpy()))
         self.assertTrue(
-            np.allclose(static_token_num_value, dy_token_num._numpy()))
+            np.array_equal(static_token_num_value, dy_token_num._numpy()))

         for key, value in six.iteritems(static_param_init):
-            self.assertTrue(np.allclose(value, dy_param_init[key]))
+            self.assertTrue(np.array_equal(value, dy_param_init[key]))
         for key, value in six.iteritems(static_param_updated):
-            self.assertTrue(
-                np.allclose(
-                    value, dy_param_updated[key], atol=1e-4))
+            self.assertTrue(np.array_equal(value, dy_param_updated[key]))

 if __name__ == '__main__':
......
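The test assertions are tightened from tolerance-based np.allclose (with atol up to 1e-4) to exact np.array_equal, reflecting that the dygraph backward pass should now reproduce the static-graph results exactly. The difference between the two checks, shown in plain numpy with illustrative values:

```python
import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = a + 1e-6                           # tiny numerical discrepancy

print(np.allclose(a, b, atol=1e-5))    # True  -- passes the old tolerance check
print(np.array_equal(a, b))            # False -- the new check demands exact equality
print(np.array_equal(a, a.copy()))     # True  -- identical values still pass
```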