Unverified commit 865a4598 authored by liym27, committed by GitHub

Check whether there is any inplace operation affecting gradient calculation. (#27901)

* Add a class TensorInplaceVersion to count the inplace version and put it in framework::Tensor instead of Allocation or Variable.

* Add a new attribute `_inplace_version` for VarBase.

* Raise exception if an inplace operation can result in incorrect gradient computation.

* Add a new interface _bump_inplace_version() for VarBase to bump the version whenever the Tensor is modified through an inplace operation.

* For api assign, call _bump_inplace_version() when it's an inplace operation in dynamic mode.

* Use original var_wrapper if the inplace_version is not changed.

* Replace SnapshotVarWrapperList with SnapshotVarWrapper to optimize performance.
Parent c21a9797
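In dygraph mode, the check surfaces as a runtime error during backward(). A minimal sketch of the intended behavior, mirroring the unit tests added in this commit (the exact error text may differ):

import paddle
import numpy as np

with paddle.fluid.dygraph.guard():
    var_a = paddle.to_tensor(np.ones((4, 2, 3)).astype(np.float32))
    var_a.stop_gradient = False
    var_b = var_a**2
    var_c = var_b**2    # the grad op of var_c needs the forward value of var_b
    var_b[1:2] = 3.3    # inplace write after var_b was used bumps its inplace version
    loss = var_c.sum()
    loss.backward()     # expected to raise: tensor_version != wrapper_version_snapshot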
......@@ -43,6 +43,49 @@ namespace framework {
class LoDTensor;
/*
NOTE(liym27): [ What is TensorInplaceVersion used for? ]
TensorInplaceVersion is a version counter and every Tensor has a version
counter. It's used to check whether an inplace operation will result in an
incorrect gradient calculation. Version is incremented when the data of the
Variable is modified in place.
- Question: In what scenarios will version counters be shared?
- Answer: When two Variables/VarBases share the same C++ Tensor(its Allocation
may change), both of them share the same version counter. For examples:
1. `z = paddle.assign(input=x, output=y)`: `z` shares the same version counter
as `y` because z and y are the same VarBase;
2. `y = x.detach()`: `y` shares the same version counter as `x`.
- Question: In what scenarios will version counters NOT be shared?
- Answer: When a `Variable`'s data is replaced by calling `Tensor::ShareDataWith(...)`
or `Tensor::ShareBufferWith(...)`, because the two Tensors then share the same
Allocation but not the same framework::Tensor.
- Question: Why put the inplace_version_counter_ in framework::Tensor instead
of Allocation or Variable?
- Answer:
1. A Tensor can call ResetHolder() to replace its Allocation, so if the
inplace_version_counter_ lived in the Allocation it would change on reset,
which would give confusing inplace version information.
2. If the inplace_version_counter_ lived in the Variable, different
VariableWrappers would need to share the same Variable. However, a
VariableWrapper holds a Variable object, not a pointer.
*/
class TensorInplaceVersion {
public:
explicit TensorInplaceVersion(uint32_t inplace_version = 0)
: inplace_version_(inplace_version) {}
bool IsUnique() const { return inplace_version_ == 0; }
void Bump() { ++inplace_version_; }
uint32_t CurrentVersion() const { return inplace_version_; }
private:
uint32_t inplace_version_;
};
class Tensor {
#ifdef PADDLE_WITH_MKLDNN
......@@ -189,6 +232,9 @@ class Tensor {
void ResetHolderWithType(std::shared_ptr<memory::Allocation> holder,
const proto::VarType::Type type);
TensorInplaceVersion& InplaceVersionCounter() {
return inplace_version_counter_;
}
private:
/*! holds the memory block if allocated. */
......@@ -225,6 +271,7 @@ class Tensor {
* PlaceHolder::ptr_ and where the tensor data really begins.
*/
size_t offset_;
TensorInplaceVersion inplace_version_counter_;
};
} // namespace framework
......
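As a sketch of the sharing rule described in the NOTE above (assuming the Python-side inplace_version property added later in this commit), a detached VarBase shares the same C++ Tensor and therefore the same version counter:

import paddle

paddle.disable_static()
x = paddle.ones(shape=[2, 3], dtype="float32")
y = x.detach()              # y shares the same Tensor and version counter as x
x[0] = 2.0                  # an inplace write on x bumps the shared counter
print(x.inplace_version)    # 1
print(y.inplace_version)    # 1, shared with x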
......@@ -18,8 +18,8 @@
#include <typeindex>
#include <typeinfo>
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type_traits.h"
namespace paddle {
namespace framework {
......@@ -69,6 +69,15 @@ class Variable {
return holder_->Type();
}
private:
// This method hides type T, so it doesn't appear as a template parameter of
// Variable.
framework::TensorInplaceVersion* InplaceVersionCounter();
public:
uint32_t CurrentInplaceVersion();
void BumpInplaceVersion();
private:
struct Placeholder {
virtual ~Placeholder() PADDLE_MAY_THROW {}
......@@ -101,8 +110,48 @@ class Variable {
};
// pointers to a PlaceholderImpl object indeed.
std::unique_ptr<Placeholder> holder_;
std::shared_ptr<Placeholder> holder_;
};
inline framework::TensorInplaceVersion* Variable::InplaceVersionCounter() {
framework::TensorInplaceVersion* version_counter_ptr(nullptr);
if (IsType<framework::LoDTensor>()) {
version_counter_ptr =
&GetMutable<framework::LoDTensor>()->InplaceVersionCounter();
} else if (IsType<framework::Tensor>()) {
version_counter_ptr =
&GetMutable<framework::Tensor>()->InplaceVersionCounter();
} else if (IsType<framework::SelectedRows>()) {
version_counter_ptr = &GetMutable<framework::SelectedRows>()
->mutable_value()
->InplaceVersionCounter();
} else {
VLOG(4) << "Only supports Tensor, LoDTensor, SelectedRows to have "
"TensorInplaceVersion, but received type "
<< platform::demangle(framework::ToTypeName(Type()));
}
return version_counter_ptr;
}
inline uint32_t Variable::CurrentInplaceVersion() {
auto version_counter_ptr = InplaceVersionCounter();
if (version_counter_ptr) {
return version_counter_ptr->CurrentVersion();
} else {
return 0;
}
}
inline void Variable::BumpInplaceVersion() {
auto version_counter_ptr = InplaceVersionCounter();
if (version_counter_ptr) {
return version_counter_ptr->Bump();
} else {
VLOG(4) << "Only supports Tensor, LoDTensor, SelectedRows to have "
"TensorInplaceVersion, but received type "
<< platform::demangle(framework::ToTypeName(Type()));
}
}
} // namespace framework
} // namespace paddle
......@@ -225,6 +225,31 @@ void BasicEngine::Execute() {
}
}
VLOG(4) << "Check whether there is any inplace operation affecting "
"gradient calculation.";
for (auto& pair : bwd_ins) {
for (auto& var_wrapper : pair.second) {
auto wrapper_version_snapshot = var_wrapper->InplaceVersionSnapshot();
auto tensor_version =
var_wrapper->MutableVar()->CurrentInplaceVersion();
PADDLE_ENFORCE_EQ(
tensor_version, wrapper_version_snapshot,
platform::errors::PermissionDenied(
"Tensor '%s' used in gradient computation in grad op '%s' "
"has been "
"modified by an inplace operation. "
"Its version is %s but the expected version is %s. "
"Please fix your code to void calling an inplace operator "
"after using the Tensor which will used in gradient "
"computation.",
var_wrapper->Name(), cur_op.Type(), tensor_version,
wrapper_version_snapshot));
VLOG(6) << " The version of Tensor '" << var_wrapper->Name()
<< "' is [ " << wrapper_version_snapshot << " ]";
}
}
{
VLOG(3) << "Start to execute grad op " << cur_op.Type();
OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(),
......
......@@ -147,7 +147,6 @@ class GradOpBaseMakerBase {
bool is_input) const {
const auto& data_map = is_input ? var_base_map_in_ : var_base_map_out_;
auto iterator = data_map.find(name);
TracedVarList<VarBase, kRole> vec_temp;
if (iterator != data_map.end()) {
vec_temp.reserve(iterator->second.size());
......@@ -226,6 +225,7 @@ class TracedGradOp {
}
auto var_wrappers = ToVarWrapperList<kRole>(vars);
if (!var_wrappers.empty()) {
op_->SetInput(name, std::move(var_wrappers),
kRole == TracedVarRole::kBackward);
......@@ -293,7 +293,8 @@ class TracedGradOp {
var->OverridedStopGradient()))) {
result.emplace_back();
} else {
result.emplace_back(var->SharedVar());
auto var_wrapper = SnapshotVarWrapper(var->SharedVar());
result.emplace_back(var_wrapper);
has_valid = true;
}
}
......@@ -304,6 +305,26 @@ class TracedGradOp {
return result;
}
// Get a snapshot of VariableWrapper at a certain inplace version.
// The inplace version number of VariableWrapper is used for inplace
// detection in gradient computation.
static const std::shared_ptr<VariableWrapper> SnapshotVarWrapper(
const std::shared_ptr<VariableWrapper>& var_wrapper) {
// NOTE(liym27):
// Use the original var_wrapper if its inplace_version has not
// changed. Otherwise, always making a copy would affect the accuracy
// of the model results and break double grad.
if (!var_wrapper->MutableVar()->IsInitialized() ||
var_wrapper->InplaceVersionSnapshot() ==
var_wrapper->MutableVar()->CurrentInplaceVersion()) {
return var_wrapper;
} else {
VariableWrapper new_var_wrapper = *var_wrapper.get();
new_var_wrapper.ResetInplaceVersion();
return std::make_shared<VariableWrapper>(new_var_wrapper);
}
}
private:
const std::shared_ptr<GradOpNode>& node_;
OpBase* op_;
......
......@@ -278,6 +278,15 @@ std::shared_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
}
}
void VarBase::BumpInplaceVersion() {
PADDLE_ENFORCE_EQ(
Var().IsInitialized(), true,
platform::errors::InvalidArgument(
"Tensor %s has not been initialized, please check if it has no data.",
Name()));
MutableVar()->BumpInplaceVersion();
}
void OpBase::SetType(const std::string& type) {
op_ = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
}
......
......@@ -202,6 +202,8 @@ class VarBase {
std::shared_ptr<VarBase> NewVarBase(const platform::Place& dst_place,
const bool blocking) const;
void BumpInplaceVersion();
private:
/**
* NOTE(zengjinle): never remove the const qualifier of `var_` if you are
......
......@@ -174,6 +174,17 @@ class VariableWrapper {
std::shared_ptr<LeafVarHookPipeline>& GetLeafHooks() { return leaf_hooks_; }
uint32_t InplaceVersionSnapshot() const { return inplace_version_snapshot_; }
void ResetInplaceVersion() {
auto new_version = var_.CurrentInplaceVersion();
VLOG(6) << "The wrapper version of VariableWrapper '" << name_
<< "' will be updated from " << inplace_version_snapshot_ << "to "
<< new_version;
inplace_version_snapshot_ = new_version;
}
private:
void SetGradVar(const std::shared_ptr<VariableWrapper>& var) {
auto shared_var = grad_var_.lock();
......@@ -244,6 +255,10 @@ class VariableWrapper {
int overrided_stop_gradient_{-1};
bool persistable_{false};
// Used for checking whether there is any inplace operation affecting gradient
// calculation.
uint32_t inplace_version_snapshot_{0};
framework::proto::VarType::Type type_{framework::proto::VarType::LOD_TENSOR};
framework::proto::VarType::Type data_type_{framework::proto::VarType::FP32};
......
......@@ -593,6 +593,10 @@ void BindImperative(py::module *m_ptr) {
SetTensorFromPyArray(self_tensor, self_numpy,
self_tensor->place(), true);
}
// NOTE(liym27):
// Increase the version of VarBase self because __setitem__ is an
// inplace operator for the VarBase self.
self->BumpInplaceVersion();
})
.def("__getitem__",
[](std::shared_ptr<imperative::VarBase> &self, py::handle _index) {
......@@ -632,6 +636,28 @@ void BindImperative(py::module *m_ptr) {
return out;
}
})
.def("_inplace_version",
[](imperative::VarBase &self) -> uint32_t {
const auto &var = self.MutableVar();
PADDLE_ENFORCE_EQ(
var->IsInitialized(), true,
platform::errors::InvalidArgument(
"Tensor of %s is Empty, please check if it has no data.",
self.Name()));
return var->CurrentInplaceVersion();
})
.def("_bump_inplace_version",
[](std::shared_ptr<imperative::VarBase> &self) {
// NOTE(liym27): _bump_inplace_version is only used for inplace
// operations
self->BumpInplaceVersion();
},
R"DOC(
**Notes**:
**This API is ONLY available in Dygraph mode.**
**This is a very low level API. Users should not use it directly.**
Bump the version whenever the Tensor is modified through an inplace operation.
)DOC")
.def("numpy",
[](imperative::VarBase &self) -> py::array {
const auto &tensor =
......
......@@ -226,6 +226,27 @@ def monkey_patch_varbase():
return self.gradient()
@property
def inplace_version(self):
"""
The inplace version of the current Tensor.
The version number is incremented whenever the current Tensor is modified through an inplace operation.
**Notes: This is a read-only property**
Examples:
.. code-block:: python
import paddle
var = paddle.ones(shape=[4, 2, 3], dtype="float32")
print(var.inplace_version) # 0
var[1] = 2.2
print(var.inplace_version) # 1
"""
return self._inplace_version()
def __str__(self):
"""
Convert a VarBase object to a readable string.
......@@ -264,8 +285,9 @@ def monkey_patch_varbase():
("__bool__", __bool__), ("__nonzero__", __nonzero__),
("_to_static_var", _to_static_var), ("set_value", set_value),
("block", block), ("backward", backward), ("grad", grad),
("gradient", gradient), ("__str__", __str__), ("__repr__", __str__),
("__module__", "paddle"), ("__name__", "Tensor")):
("inplace_version", inplace_version), ("gradient", gradient),
("__str__", __str__), ("__repr__", __str__), ("__module__", "paddle"),
("__name__", "Tensor")):
setattr(core.VarBase, method_name, method)
# patch math methods for varbase
......
......@@ -13,8 +13,12 @@
# limitations under the License.
from __future__ import print_function
import numpy
import six
import warnings
from six.moves import reduce
from ..layer_helper import LayerHelper
from ..param_attr import ParamAttr
from ..initializer import Initializer
......@@ -27,8 +31,7 @@ from .layer_function_generator import templatedoc
from . import utils
from ..data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype
from paddle.utils import deprecated
import numpy
import warnings
from .utils import check_shape
__all__ = [
......@@ -556,6 +559,8 @@ def assign(input, output=None):
"""
helper = LayerHelper('assign', **locals())
check_type(input, 'input', (Variable, numpy.ndarray), 'assign')
is_inplace = True if output is not None else False
if isinstance(input, Variable):
check_dtype(
input.dtype, 'input',
......@@ -600,6 +605,9 @@ def assign(input, output=None):
value_name: values
})
if is_inplace and in_dygraph_mode():
output._bump_inplace_version()
return output
......
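For example, a sketch of the dygraph behavior above: calling assign with an explicit output is treated as an inplace write into that output, so its version is bumped (this mirrors the assign case in the new unit test):

import paddle

paddle.disable_static()
y = paddle.ones(shape=[3], dtype="float32")
print(y.inplace_version)    # 0
paddle.nn.functional.assign(paddle.zeros(shape=[3]), y)    # inplace write into y
print(y.inplace_version)    # 1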
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
class TestInplace(unittest.TestCase):
def test_forward_version(self):
with paddle.fluid.dygraph.guard():
var = paddle.to_tensor(np.ones((4, 2, 3)).astype(np.float32))
self.assertEqual(var.inplace_version, 0)
var[0] = 1.1
self.assertEqual(var.inplace_version, 1)
paddle.nn.functional.assign(paddle.ones(shape=[3]), var)
# NOTE(liym27): assign(input, output) is an inplace operation for output.
# Because api assign has inplace-related processing, var.inplace_version should be 2, not 1.
self.assertEqual(var.inplace_version, 2)
var[2] = 3
self.assertEqual(var.inplace_version, 3)
def test_backward_error(self):
# It raises an error because the inplace operator will result
# in incorrect gradient computation.
with paddle.fluid.dygraph.guard():
var_a = paddle.ones(shape=[4, 2, 3], dtype="float32")
var_a.stop_gradient = False
var_b = var_a**2
# Here, the gradient computation will use the value of var_b
var_c = var_b**2
var_b[1:2] = 3.3 # var_b is modified inplace after using it
var_d = var_b**2
loss = paddle.nn.functional.relu(var_c + var_d)
with self.assertRaisesRegexp(
RuntimeError,
"received tensor_version:{} != wrapper_version_snapshot:{}".
format(1, 0)):
loss.backward()
def test_backward_success_1(self):
# var_b is modified inplace before using it, the inplace operator doesn't result
# in incorrect gradient computation.
with paddle.fluid.dygraph.guard():
var_a = paddle.ones(shape=[4, 2, 3], dtype="float32")
var_a.stop_gradient = False
var_b = var_a**2
var_b[1:2] = 3 # var_b is modified inplace before using it
# Here, the gradient computation will use the value of var_b
var_c = var_b**2
loss = var_c.sum()
loss.backward()
def test_backward_success_2(self):
# Although var_b is modified inplace after being used, it is not used in gradient computation.
# The inplace operator doesn't result in incorrect gradient computation.
with paddle.fluid.dygraph.guard():
var_a = paddle.ones(shape=[4, 2, 3], dtype="float32")
var_a.stop_gradient = False
var_b = var_a**2
var_b[1:2] = 3 # var_b is modified inplace before using it
var_c = var_b + var_b # Here, the grad op of add doesn't use the value of var_b
loss = var_c.sum()
var_b[1:2] = 3 # var_b is modified inplace after using it
loss.backward()
if __name__ == '__main__':
unittest.main()
......@@ -21,8 +21,6 @@ import six
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers
from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_, in_dygraph_mode
class TestVarBase(unittest.TestCase):
......@@ -515,9 +513,11 @@ class TestVarBaseSetitem(unittest.TestCase):
def _test(self, value):
paddle.disable_static()
id_origin = id(self.tensor_x)
self.assertEqual(self.tensor_x.inplace_version, 0)
id_origin = id(self.tensor_x)
self.tensor_x[0] = value
self.assertEqual(self.tensor_x.inplace_version, 1)
if isinstance(value, (six.integer_types, float)):
result = np.zeros((2, 3)).astype(np.float32) + value
......@@ -529,10 +529,12 @@ class TestVarBaseSetitem(unittest.TestCase):
self.assertEqual(id_origin, id(self.tensor_x))
self.tensor_x[1:2] = value
self.assertEqual(self.tensor_x.inplace_version, 2)
self.assertTrue(np.array_equal(self.tensor_x[1].numpy(), result))
self.assertEqual(id_origin, id(self.tensor_x))
self.tensor_x[...] = value
self.assertEqual(self.tensor_x.inplace_version, 3)
self.assertTrue(np.array_equal(self.tensor_x[3].numpy(), result))
self.assertEqual(id_origin, id(self.tensor_x))
......@@ -553,5 +555,30 @@ class TestVarBaseSetitem(unittest.TestCase):
self._test(3.3)
class TestVarBaseInplaceVersion(unittest.TestCase):
def test_setitem(self):
paddle.disable_static()
var = paddle.ones(shape=[4, 2, 3], dtype="float32")
self.assertEqual(var.inplace_version, 0)
var[1] = 1
self.assertEqual(var.inplace_version, 1)
var[1:2] = 1
self.assertEqual(var.inplace_version, 2)
def test_bump_inplace_version(self):
paddle.disable_static()
var = paddle.ones(shape=[4, 2, 3], dtype="float32")
self.assertEqual(var.inplace_version, 0)
var._bump_inplace_version()
self.assertEqual(var.inplace_version, 1)
var._bump_inplace_version()
self.assertEqual(var.inplace_version, 2)
if __name__ == '__main__':
unittest.main()