Commit 7000ec85 authored by S sneaxiy

fix some op grad makers

fix ctest eager deletion disable bug
test=develop
Parent f8ed2c22
@@ -63,7 +63,7 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
-cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory)
+cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags)
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
...
@@ -194,10 +194,17 @@ ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx,
 }
 /**
- * Shrink op dependencies. If some ops do not Tensor buffer of any input,
+ * Shrink op dependencies according to no-need-buffer vars.
+ *
+ * If some ops do not need the Tensor buffer of any input,
  * just remove the dependency of this op, i.e., decrease reference count.
  *
- * Returns whether the dependency count decreases to 0.
+ * For example, input Y of elementwise_add_grad op is only used to infer the
+ * shape and lod of Y@GRAD, so we do not need the buffer of input Y. The data
+ * buffer of input Y can be collected before elementwise_add_grad op runs.
+ *
+ * This method returns whether the dependency count decreases to 0, and
+ * shrinks the op dependency if possible.
  */
 static bool ShrinkNoNeedBufferVarOpDependency(
     const std::string &var_name,
@@ -214,7 +221,7 @@ static bool ShrinkNoNeedBufferVarOpDependency(
   inferer(op_base->Inputs(), op_base->Outputs(), op_base->Attrs());
   // Check whether var_name occurs in other inputs or outputs of the op
-  // If it occurs, we cannot precede reference count to previous op
+  // If it occurs, we cannot decrease the dependency number.
   bool occurred_in_other_vars = false;
   for (auto &in_pair : op_base->Inputs()) {
     if (no_need_buffer_vars.count(in_pair.first) > 0) {
...
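The no-need-buffer mechanism referenced in the comment above is driven by a per-op declaration. A minimal sketch of that declaration side, modeled on the lod_reset changes later in this commit (the my_op_grad op and its class names are hypothetical):

    // Declare that my_op_grad only uses the shape/lod metadata of input "X",
    // never its data buffer, so the pass above may release X's buffer before
    // the op runs and decrease the dependency count early.
    DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MyOpGradNoNeedBufferVarInference,
                                          "X");

    // The inference class is attached when the op is registered.
    REGISTER_OPERATOR(my_op_grad, ops::MyOpGradOp,
                      ops::MyOpGradNoNeedBufferVarInference);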
@@ -21,6 +21,7 @@
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#include "gflags/gflags.h"
 #include "paddle/fluid/framework/garbage_collector.h"
 namespace paddle {
...
@@ -1017,6 +1017,8 @@ Scope* OperatorWithKernel::PrepareData(
       // of search key even though the set is empty.
       if (!no_buffer_ins.empty() &&
           no_buffer_ins.count(var_name_item.first) > 0) {
+        VLOG(1) << "Skip scanning input " << var_name_item.first
+                << " in Operator " << type_;
         continue;
       }
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/add_position_encoding_op.h"
+#include <memory>
 namespace paddle {
 namespace operators {
@@ -39,13 +40,8 @@ class AddPositionEncodingOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Out@GRAD must not be null.");
-    auto out_dims = ctx->GetInputDim("Out");
     if (ctx->HasOutput(framework::GradVarName("X"))) {
+      auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
       ctx->SetOutputDim(framework::GradVarName("X"), out_dims);
     }
   }
@@ -75,6 +71,22 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
+class AddPositionEncodingGradOpDescMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("add_position_encoding_grad");
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
 }  // namespace operators
 }  // namespace paddle
@@ -83,7 +95,7 @@ namespace plt = paddle::platform;
 REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp,
                   ops::AddPositionEncodingOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::AddPositionEncodingGradOpDescMaker);
 REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad);
 REGISTER_OP_CPU_KERNEL(
...
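For contrast with the hand-written maker above: paddle::framework::DefaultGradOpDescMaker<true>, which it replaces, forwards every forward input, every forward output, and the output gradients to the grad op. Written out as a sketch (an illustration of the old wiring under that assumption, not the template's actual code), the generated desc was roughly:

    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
    op->SetType("add_position_encoding_grad");
    // X and Out were forwarded as well, so their buffers had to stay alive
    // until the grad op ran; that is the dependency this commit removes.
    op->SetInput("X", Input("X"));
    op->SetInput("Out", Output("Out"));
    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetAttrMap(Attrs());
    return op;

Since the grad kernel only needs the shape of Out@GRAD, the new maker drops X and Out entirely, and the grad op's InferShape above now reads the dims from Out@GRAD instead of Out.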
@@ -567,7 +567,6 @@ std::unique_ptr<framework::OpDesc> BatchNormGradMaker::Apply() const {
   op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
   op->SetInput("Scale", Input("Scale"));
-  op->SetInput("Bias", Input("Bias"));
   op->SetInput("SavedMean", Output("SavedMean"));
   op->SetInput("SavedVariance", Output("SavedVariance"));
...
@@ -455,13 +455,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
   return type;
 }
-class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
+class Conv2DGradMaker : public framework::SingleGradOpDescMaker {
  public:
   using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
   std::unique_ptr<framework::OpDesc> Apply() const override {
     auto* op = new framework::OpDesc();
-    op->SetType(GradOpType());
+    op->SetType(this->ForwardOpType() + "_grad");
     op->SetInput("Input", Input("Input"));
     op->SetInput("Filter", Input("Filter"));
     op->SetInput("Bias", Input("Bias"));
@@ -470,14 +470,33 @@ class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
     op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
     op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
     op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
     op->SetAttrMap(Attrs());
     return std::unique_ptr<framework::OpDesc>(op);
   }
+};
+
+class Conv3DGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("Input", Input("Input"));
+    op->SetInput("Filter", Input("Filter"));
+    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
+    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
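+    // "ResidualData" is a dispensable input of the forward conv op, so it is
+    // forwarded to the grad op only when the forward op actually has it.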
if (ForwardOp().Inputs().count("ResidualData") != 0) {
op->SetInput("ResidualData", Input("ResidualData"));
}
op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(op);
} }
}; };
...@@ -486,17 +505,16 @@ class Conv2dGradMaker : public framework::SingleGradOpDescMaker { ...@@ -486,17 +505,16 @@ class Conv2dGradMaker : public framework::SingleGradOpDescMaker {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker,
ops::ConvOpInferVarType, ops::Conv2dGradMaker); ops::ConvOpInferVarType, ops::Conv2DGradMaker);
REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad);
// depthwise convolution op // depthwise convolution op
REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
ops::ConvOpInferVarType, ops::Conv2dGradMaker); ops::ConvOpInferVarType, ops::Conv2DGradMaker);
REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad);
REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker,
ops::ConvOpInferVarType, ops::ConvOpInferVarType, ops::Conv3DGradMaker);
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad);
// depthwise conv kernel // depthwise conv kernel
......
@@ -238,6 +238,23 @@ class CrossEntropyGradientOp : public CrossEntropyGradientOpBase {
   }
 };
+class CrossEntropyGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("cross_entropy_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("Label", Input("Label"));
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
 class CrossEntropyOp2 : public CrossEntropyOpBase {
  public:
   using CrossEntropyOpBase::CrossEntropyOpBase;
@@ -354,7 +371,7 @@ using CPUCtx = paddle::platform::CPUDeviceContext;
 REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase,
                   ops::CrossEntropyOpMaker, ops::CrossEntropyOpInferVarType,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::CrossEntropyGradOpDescMaker);
 REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<CPUCtx, float>,
                        ops::CrossEntropyOpKernel<CPUCtx, double>);
...
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <memory>
 #include <string>
 #include "paddle/fluid/framework/op_registry.h"
@@ -170,11 +171,6 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Input"),
                    "Input(Input) of LSTM should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("last_h"),
-                   "Input(last_h) of LSTM should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("last_c"),
-                   "Input(last_c) of LSTM should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Cache"),
                    "Input(Cache) of LSTM should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("InitH"),
@@ -197,6 +193,35 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
   }
 };
+class CudnnLSTMGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("cudnn_lstm_grad");
+    op->SetInput("Input", Input("Input"));
+    op->SetInput("InitH", Input("InitH"));
+    op->SetInput("InitC", Input("InitC"));
+    op->SetInput("W", Input("W"));
if (ForwardOp().Inputs().count("Cache") > 0) {
op->SetInput("Cache", Input("Cache"));
}
op->SetInput("Out", Output("Out"));
op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
op->SetInput(framework::GradVarName("last_c"), OutputGrad("last_c"));
op->SetInput(framework::GradVarName("last_h"), OutputGrad("last_h"));
op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
op->SetOutput(framework::GradVarName("W"), InputGrad("W"));
op->SetOutput(framework::GradVarName("InitH"), InputGrad("InitH"));
op->SetOutput(framework::GradVarName("InitC"), InputGrad("InitC"));
op->SetAttrMap(Attrs());
return op;
}
};
template <typename T> template <typename T>
class NotImpleKernel : public framework::OpKernel<T> { class NotImpleKernel : public framework::OpKernel<T> {
public: public:
...@@ -211,7 +236,7 @@ class NotImpleKernel : public framework::OpKernel<T> { ...@@ -211,7 +236,7 @@ class NotImpleKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>); ops::CudnnLSTMGradOpDescMaker);
REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp);
REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel<float>); REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel<float>);
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/lod_reset_op.h"
+#include <memory>
 namespace paddle {
 namespace operators {
@@ -146,18 +147,39 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
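+    // "X" is declared below as a no-need-buffer var of lod_reset_grad, so its
+    // data buffer may already be collected when this runs; infer the kernel
+    // type from Out@GRAD instead of X.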
-    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
-                                   ctx.device_context());
+    return framework::OpKernelType(
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"))->type(),
+        ctx.device_context());
   }
 };
+class LoDResetGradDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("lod_reset_grad");
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("X", Input("X"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference,
+                                      "X");
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::LoDResetGradDescMaker);
-REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp);
+REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp,
+                  ops::LoDResetGradNoNeedBufferVarInference);
 REGISTER_OP_CPU_KERNEL(
     lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
     ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
...
@@ -75,6 +75,10 @@ DEFINE_bool(reader_queue_speed_test_mode, false,
             "If set true, the queue.pop will only get data from queue but not "
             "remove the data from queue for speed testing");
+DECLARE_double(eager_delete_tensor_gb);
+DECLARE_bool(fast_eager_deletion_mode);
+DECLARE_double(memory_fraction_of_eager_deletion);
 // disable auto conversion to list in Python
 PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
@@ -150,6 +154,16 @@ PYBIND11_MODULE(core, m) {
     return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj);
   });
+  // NOTE(zjl): ctest loads environment variables at the very beginning, even
+  // before we `import paddle.fluid as fluid`, so we add this API to enable
+  // eager deletion mode from inside a unittest.
+  m.def("_set_eager_deletion_mode",
+        [](double threshold, double fraction, bool fast_mode) {
+          FLAGS_eager_delete_tensor_gb = threshold;
+          FLAGS_memory_fraction_of_eager_deletion = fraction;
+          FLAGS_fast_eager_deletion_mode = fast_mode;
+        });
 m.add_object("_cleanup",
              py::capsule([]() { ScopePool::Instance().Clear(); }));
...
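The test updates below rely on this hook: each unittest now calls fluid.core._set_eager_deletion_mode(0.0, 1.0, True) right after importing fluid (the partial-GC transformer test passes 0.55 as the fraction), instead of writing os.environ['FLAGS_...'] entries that take no effect once ctest has already loaded the environment.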
...@@ -14,8 +14,6 @@ ...@@ -14,8 +14,6 @@
import os import os
import numpy as np import numpy as np
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
os.environ['FLAGS_fast_eager_deletion_mode'] = '1'
os.environ['FLAGS_use_ngraph'] = '0' os.environ['FLAGS_use_ngraph'] = '0'
os.environ['FLAGS_use_mkldnn'] = '0' os.environ['FLAGS_use_mkldnn'] = '0'
os.environ['CPU_NUM'] = '4' os.environ['CPU_NUM'] = '4'
...@@ -25,6 +23,8 @@ import six ...@@ -25,6 +23,8 @@ import six
import unittest import unittest
import multiprocessing import multiprocessing
fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
def simple_fc_net(): def simple_fc_net():
image = fluid.layers.data(name='image', shape=[784], dtype='float32') image = fluid.layers.data(name='image', shape=[784], dtype='float32')
......
@@ -13,7 +13,6 @@
 # limitations under the License.
 import os
-os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
 os.environ['CPU_NUM'] = '2'
 import six
...
@@ -16,6 +16,8 @@ import unittest
 from test_eager_deletion_dynamic_rnn_base import TestBase
 import paddle.fluid as fluid
+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
 def gru_net(data,
             label,
...
@@ -16,6 +16,8 @@ from test_eager_deletion_dynamic_rnn_base import TestBase
 import paddle.fluid as fluid
 import unittest
+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
 def lstm_net(data,
             label,
...
@@ -14,7 +14,8 @@
 import os
 import unittest
-os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
 # FIXME(zjl): It seems that this unittest fails randomly
 # when comparing all reduce last loss and reduce last loss
...
@@ -14,7 +14,8 @@
 import os
 import unittest
-os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
 os.environ['RECORDIO_FILENAME'] = './eager_deletion_transformer.wmt16.recordio'
...
@@ -16,8 +16,6 @@ from __future__ import print_function
 import os
 os.environ['CPU_NUM'] = '2'
-os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
-os.environ['FLAGS_fast_eager_deletion_mode'] = '1'
 import unittest
 import paddle.fluid as fluid
@@ -29,6 +27,8 @@ import paddle.fluid.compiler as compiler
 import numpy
 import multiprocessing
+fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
 class TestEagerDeletionWhileOpBase(unittest.TestCase):
     def test_main(self):
...
@@ -14,11 +14,12 @@
 import os
 import unittest
-os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"
-os.environ['FLAGS_memory_fraction_of_eager_deletion'] = "0.55"
+import paddle.fluid as fluid
 os.environ['RECORDIO_FILENAME'] = './p_gc_transformer.wmt16.recordio'
+fluid.core._set_eager_deletion_mode(0.0, 0.55, True)
 from test_parallel_executor_transformer import TestTransformer
 if __name__ == '__main__':
...
@@ -168,3 +168,7 @@ class TestROIAlignOp(OpTest):
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
+if __name__ == '__main__':
+    unittest.main()