From 2c6d07570ad2b8f15d15f4fec84ea66c9e31c07f Mon Sep 17 00:00:00 2001
From: Jiabin Yang
Date: Mon, 7 Oct 2019 19:48:55 +0800
Subject: [PATCH] test=release/1.6, Fix/auto prune error on leaf (#20056) (#20170)

* test=develop, fix docker with paddle nccl problem
* test=develop, Add Variable api and refine dygraph related API
* test=develop, Add Variable api and refine dygraph related API
* test=develop, refine test for new api and error info
* test=develop, refine error info and test_layers
* test=develop, add API.spec
* test=develop, fix to_string python2 and python3 compat error and refine doc
* test=develop, add API spec
* test=develop, update API spec
* test=develop, update API spec
* test=develop, invoke ci
* test=develop, fix example code
* test=develop, update API spec
* test=develop, fix auto_prune_error_on_leaf
* test=develop, fix auto prune error on loss stop_gradient
* test=develop, remove useless error check
* test=develop, add more ut for sorted gradient
---
 paddle/fluid/imperative/engine.cc             |  5 +-
 .../fluid/imperative/gradient_accumulator.cc  | 33 ++++++++--
 .../unittests/test_imperative_auto_prune.py   | 66 +++++++++++++++++++
 3 files changed, 95 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc
index 3a41bafbfc4..877e6ceb6a4 100644
--- a/paddle/fluid/imperative/engine.cc
+++ b/paddle/fluid/imperative/engine.cc
@@ -44,8 +44,9 @@ void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy) {
   const std::vector<OpBase*> ops = var->GradVarBase()->GradOps();
   var->ClearGradOps();
 
-  if (ops.empty()) {
-    VLOG(3) << "Skip auto grad since there is no grad op for var: "
+  if (ops.empty() || var->OverridedStopGradient()) {
+    VLOG(3) << "Skip auto grad since there is no grad op for var or loss is "
+               "stop_gradient=True: "
             << var->Name();
     return;
   } else {
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index 509415a3672..873164fc287 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -116,11 +116,21 @@ void EagerGradientAccumulator::Add(std::shared_ptr<VarBase> var,
   } else {
     if (!var_->Var().IsInitialized() ||
         !var_->Var().Get<framework::LoDTensor>().IsInitialized()) {
-      VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero";
+      VLOG(6) << "Set StopGradient Grad: " << var_->Name() << " as zero ";
+
       auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>();
-      tensor->mutable_data(place, var->DataType());
-      operators::math::set_constant(*dev_ctx, tensor, 0.0);
+      if (!var_->Var().IsInitialized()) {
+        auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>();
+        VLOG(6) << "Dims of " << var_->Name() << " is set as: "
+                << var->Var().Get<framework::LoDTensor>().dims();
+        tensor->Resize(var->Var().Get<framework::LoDTensor>().dims());
+        tensor->mutable_data(place, var->DataType());
+        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+      } else {
+        auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>();
+        tensor->mutable_data(place, var->DataType());
+        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+      }
     }
   }
   ++cur_cnt_;
@@ -162,9 +172,18 @@ void SortedGradientAccumulator::Add(std::shared_ptr<VarBase> var,
         !var_->Var().Get<framework::LoDTensor>().IsInitialized()) {
       VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero";
       auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>();
-      tensor->mutable_data(place, var->DataType());
-      operators::math::set_constant(*dev_ctx, tensor, 0.0);
+      if (!var_->Var().IsInitialized()) {
+        auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>();
+        VLOG(6) << "Dims of " << var_->Name() << " is set as: "
+                << var->Var().Get<framework::LoDTensor>().dims();
+        tensor->Resize(var->Var().Get<framework::LoDTensor>().dims());
+        tensor->mutable_data(place, var->DataType());
+        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+      } else {
+        auto* tensor = var_->MutableVar()->GetMutable<framework::LoDTensor>();
+        tensor->mutable_data(place, var->DataType());
+        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+      }
     }
     // looks like tmp_grad_vars will not have any member but just in case
     tmp_grad_vars_.clear();
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
index ac849e1cfb8..b84b2ac50a8 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
@@ -241,6 +241,72 @@ class TestImperativeAutoPrune(unittest.TestCase):
             self.assertTrue((fc._w.gradient() == 0).all())
             self.assertTrue((out1.gradient() == 0).all())
 
+    def test_auto_prune8(self):
+        with fluid.dygraph.guard():
+            value0 = np.arange(26).reshape(2, 13).astype("float32")
+            value1 = np.arange(6).reshape(2, 3).astype("float32")
+            value2 = np.arange(10).reshape(2, 5).astype("float32")
+            fc = fluid.FC("fc1", size=5, dtype="float32")
+            fc2 = fluid.FC("fc2", size=3, dtype="float32")
+            a = fluid.dygraph.to_variable(value0)
+            b = fluid.dygraph.to_variable(value1)
+            c = fluid.dygraph.to_variable(value2)
+            out1 = fc(a)
+            fc_origin = fc._w.numpy()
+            out2 = fc2(out1)
+            fc2_origin = fc2._w.numpy()
+            fc2._w.stop_gradient = True
+            out2.backward()
+            optimizer = fluid.optimizer.SGD(learning_rate=0.003)
+            optimizer.minimize(out2)
+            self.assertTrue(np.array_equal(fc2_origin, fc2._w.numpy()))
+            self.assertFalse(np.array_equal(fc_origin, fc._w.numpy()))
+
+    def test_auto_prune9(self):
+        with fluid.dygraph.guard():
+            value0 = np.arange(26).reshape(2, 13).astype("float32")
+            value1 = np.arange(6).reshape(2, 3).astype("float32")
+            value2 = np.arange(10).reshape(2, 5).astype("float32")
+            fc = fluid.FC("fc1", size=5, dtype="float32")
+            fc2 = fluid.FC("fc2", size=3, dtype="float32")
+            a = fluid.dygraph.to_variable(value0)
+            b = fluid.dygraph.to_variable(value1)
+            c = fluid.dygraph.to_variable(value2)
+            out1 = fc(a)
+            fc_origin = fc._w.numpy()
+            out2 = fc2(out1)
+            fc2_origin = fc2._w.numpy()
+            out2.stop_gradient = True
+            out2.backward()
+            optimizer = fluid.optimizer.SGD(learning_rate=0.003)
+            optimizer.minimize(out2)
+            self.assertTrue(np.array_equal(fc2_origin, fc2._w.numpy()))
+            self.assertTrue(np.array_equal(fc_origin, fc._w.numpy()))
+            try:
+                fc2._w.gradient()
+            except ValueError as e:
+                assert type(e) == ValueError
+
+    def test_auto_prune10(self):
+        with fluid.dygraph.guard():
+            value0 = np.arange(26).reshape(2, 13).astype("float32")
+            value1 = np.arange(6).reshape(2, 3).astype("float32")
+            value2 = np.arange(10).reshape(2, 5).astype("float32")
+            fc = fluid.FC("fc1", size=5, dtype="float32")
+            fc2 = fluid.FC("fc2", size=3, dtype="float32")
+            a = fluid.dygraph.to_variable(value0)
+            b = fluid.dygraph.to_variable(value1)
+            c = fluid.dygraph.to_variable(value2)
+            out1 = fc(a)
+            out2 = fc2(b)
+            out1.stop_gradient = True
+            out = fluid.layers.concat(input=[out1, out2, c], axis=1)
+            backward_strategy = fluid.dygraph.BackwardStrategy()
+            backward_strategy.sort_sum_gradient = True
+            out.backward(backward_strategy)
+            self.assertTrue((fc._w.gradient() == 0).all())
+            self.assertTrue((out1.gradient() == 0).all())
+
     def test_auto_prune_with_optimizer(self):
         vocab_size = 100
         size = 20
-- 
GitLab
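
A minimal dygraph sketch of the user-visible behavior this patch covers, mirroring the new
test_auto_prune9 case above; it assumes the release/1.6 API already used in those tests
(fluid.dygraph.guard, fluid.FC, fluid.dygraph.to_variable) and is an illustration, not part
of the patch:

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        fc = fluid.FC("fc1", size=5, dtype="float32")
        x = fluid.dygraph.to_variable(
            np.arange(26).reshape(2, 13).astype("float32"))
        loss = fc(x)
        loss.stop_gradient = True
        # With the engine.cc change, backward() on a stop_gradient loss skips
        # autograd (the "auto prune error on loss stop_gradient" case in the
        # commit message), so fc._w is left without a gradient.
        loss.backward()

The gradient_accumulator.cc change handles the related leaf case: when a stop_gradient
variable's gradient tensor has never been initialized, it is now resized to the incoming
gradient's dims before being zero-filled, instead of being zero-filled with no shape.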