Unverified · Commit e0c9d856 authored by Leo Chen, committed by GitHub

add unused input vars check for OpWithKernel, test=develop (#21169)

* add unused input vars check for OpWithKernel, test=develop

* remove unused vars in some ops, test=develop

* fix batch_norm, test=develop

* add white list, test=develop

* add CI check for white list, test=develop

* move white list to c++, test=develop

* solve failure of CI, test=develop

* add unittest for unused_var_check, test=develop

* refine code, enable check in operator_test, test=develop

* skip mkldnn, test=develop

* extend white list, test=develop

* refine condition of mkldnn, test=develop

* fix paddle_build, test=develop

* follow comments, test=develop

* fix GetExpectedKernelType

* add wiki ref to err_msg, test=develop

* follow comment, test=develop
Parent 664f958a
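To illustrate the behavior this commit adds, here is a minimal sketch condensed from the operator_test.cc changes further down in this diff (not a new API): when FLAGS_enable_unused_var_check is on, an OpWithKernel whose kernel never fetches one of its declared inputs fails at Run() with an EnforceNotMet error.

```cpp
// Sketch condensed from the operator_test.cc changes below. The kernel
// declares input "X" but never reads it through the ExecutionContext,
// so the unused-input check reports it after Compute() finishes.
template <typename T>
class OpWithUnusedVarKernelTest : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const {
    // Only the names are inspected; ctx.Input<Tensor>("X") is never called.
    ASSERT_EQ(ctx.InputName("X"), "X");
    ASSERT_EQ(ctx.OutputName("Y"), "Y");
  }
};

// With the flag enabled, running such an op is expected to throw:
//   FLAGS_enable_unused_var_check = true;
//   ASSERT_THROW(op->Run(scope, cpu_place), paddle::platform::EnforceNotMet);
```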
......@@ -85,6 +85,7 @@ endif()
cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits)
cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits)
cc_library(scope_pool SRCS scope_pool.cc DEPS scope)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
......@@ -128,8 +129,11 @@ cc_test(no_need_buffer_vars_inference_test SRCS no_need_buffer_vars_inference_te
cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_vars_inference)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack)
shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op_registry device_context)
......
......@@ -28,11 +28,13 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h"
DECLARE_bool(benchmark);
DECLARE_bool(check_nan_inf);
DECLARE_bool(enable_unused_var_check);
DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
DEFINE_bool(fast_check_nan_inf, false,
"Fast checking NAN/INF after each operation. It will be a little"
......@@ -434,6 +436,8 @@ bool ExecutionContext::HasOutput(const std::string& name) const {
}
const Variable* ExecutionContext::InputVar(const std::string& name) const {
LogVarUsageIfUnusedVarCheckEnabled(name);
auto it = ctx_.inputs.find(name);
if (it == ctx_.inputs.end()) return nullptr;
......@@ -463,6 +467,8 @@ const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const {
LogVarUsageIfUnusedVarCheckEnabled(name);
auto vars = MultiInputVar(name);
if (vars.size() == 0) {
return {};
......@@ -958,6 +964,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
this->InferShape(&infer_shape_ctx);
}
if (FLAGS_enable_unused_var_check) {
GetThreadLocalUsedVarNameSet()->clear();
}
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs.
(*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
......@@ -967,6 +978,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// there are inplace variables that have been transferred.
TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
}
if (FLAGS_enable_unused_var_check) {
// Skip ops that use MKL-DNN because they have a different memory reuse
// strategy. Check the attribute here because some GradMakers (like
// ActivationGradOpMaker) add inputs when use_mkldnn=true.
if (!(HasAttr("use_mkldnn") && Attr<bool>("use_mkldnn"))) {
CheckUnusedVar(*this, scope);
}
}
/*For profiling/benchmark only*/
if (FLAGS_benchmark) {
......
......@@ -35,6 +35,7 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/variant.h"
......@@ -290,6 +291,8 @@ class ExecutionContext {
virtual const std::vector<Variable*> MultiInputVar(
const std::string& name) const {
LogVarUsageIfUnusedVarCheckEnabled(name);
auto it = ctx_.inputs.find(name);
if (it == ctx_.inputs.end()) {
return {};
......@@ -330,6 +333,8 @@ class ExecutionContext {
template <typename T>
const std::vector<const T*> MultiInput(const std::string& name) const {
LogVarUsageIfUnusedVarCheckEnabled(name);
auto vars = MultiInputVar(name);
if (vars.size() == 0) {
return {};
......
......@@ -18,6 +18,8 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/init.h"
DECLARE_bool(enable_unused_var_check);
namespace paddle {
namespace framework {
......@@ -139,6 +141,8 @@ class CPUKernelTest : public OpKernel<float> {
cpu_kernel_run_num++;
ASSERT_EQ(ctx.InputName("x"), "IN1");
ASSERT_EQ(ctx.OutputName("y"), "OUT1");
auto* x = ctx.Input<Tensor>("X");
ASSERT_EQ(x, nullptr);
}
};
......@@ -626,3 +630,117 @@ void SetGetLoDLevelTestMain(std::string op_type) {
TEST(GetLoDLevelTest, base) { SetGetLoDLevelTestMain("get_lod_level_test"); }
TEST(SetLoDLevelTest, base) { SetGetLoDLevelTestMain("set_lod_level_test"); }
namespace paddle {
namespace framework {
class OpUnusedVarTest : public OperatorWithKernel {
public:
using OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
OpKernelType GetExpectedKernelType(
const ExecutionContext& ctx) const override {
return OpKernelType(proto::VarType::FP32, ctx.GetPlace(),
framework::DataLayout::kAnyLayout);
}
};
class OpUnusedVarTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "input of test op");
AddOutput("Y", "output of test op");
AddComment("This is test op for unused var check.");
}
};
template <typename T>
class OpWithUnusedVarKernelTest : public OpKernel<T> {
public:
void Compute(const ExecutionContext& ctx) const {
ASSERT_EQ(ctx.InputName("X"), "X");
ASSERT_EQ(ctx.OutputName("Y"), "Y");
}
};
template <typename T>
class OpWithoutUnusedVarKernelTest : public OpKernel<T> {
public:
void Compute(const ExecutionContext& ctx) const {
ASSERT_EQ(ctx.InputName("X"), "X");
ASSERT_EQ(ctx.OutputName("Y"), "Y");
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Output<Tensor>("Y");
ASSERT_NE(x, y);
ASSERT_NE(y, nullptr);
}
};
} // namespace framework
} // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(
op_with_unused_var, paddle::framework::OpUnusedVarTest,
paddle::framework::OpUnusedVarTestProtoAndCheckerMaker);
REGISTER_OP_CPU_KERNEL(op_with_unused_var,
paddle::framework::OpWithUnusedVarKernelTest<float>);
REGISTER_OP_WITHOUT_GRADIENT(
op_without_unused_var, paddle::framework::OpUnusedVarTest,
paddle::framework::OpUnusedVarTestProtoAndCheckerMaker);
REGISTER_OP_CPU_KERNEL(op_without_unused_var,
paddle::framework::OpWithoutUnusedVarKernelTest<float>);
// test with single input
TEST(OpWithUnusedVar, all) {
// enable the unused_var_check
FLAGS_enable_unused_var_check = true;
paddle::framework::InitDevices(true);
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_with_unused_var");
BuildVar("X", {"X"}, op_desc.add_inputs());
BuildVar("Y", {"Y"}, op_desc.add_outputs());
paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope;
auto* x = scope.Var("X")->GetMutable<paddle::framework::LoDTensor>();
auto* y = scope.Var("Y")->GetMutable<paddle::framework::LoDTensor>();
x->Resize({32, 64});
y->Resize({32, 64});
x->mutable_data<float>(cpu_place);
y->mutable_data<float>(cpu_place);
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
// should throw exception
ASSERT_THROW(op->Run(scope, cpu_place), paddle::platform::EnforceNotMet);
FLAGS_enable_unused_var_check = false;
}
TEST(OpWithoutUnusedVar, all) {
// enable the unused_var_check
FLAGS_enable_unused_var_check = true;
paddle::framework::InitDevices(true);
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("op_without_unused_var");
BuildVar("X", {"X"}, op_desc.add_inputs());
BuildVar("Y", {"Y"}, op_desc.add_outputs());
paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope;
auto* x = scope.Var("X")->GetMutable<paddle::framework::LoDTensor>();
auto* y = scope.Var("Y")->GetMutable<paddle::framework::LoDTensor>();
x->Resize({32, 64});
y->Resize({32, 64});
x->mutable_data<float>(cpu_place);
y->mutable_data<float>(cpu_place);
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
// should not throw exception
ASSERT_NO_THROW(op->Run(scope, cpu_place));
FLAGS_enable_unused_var_check = false;
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_bool(enable_unused_var_check, false,
"Checking whether operator contains unused inputs, "
"especially for grad operator. It should be in unittest.");
const std::unordered_set<std::string> op_has_unsed_vars_white_list = {
"auc",
"batch_norm",
"batch_norm_grad",
"sync_batch_norm_grad",
"center_loss_grad",
"crop",
"cvm",
"cos_sim_grad",
"dgc_momentum",
"fake_quantize_range_abs_max",
"fill_zeros_like",
"fusion_seqpool_cvm_concat",
"reshape2_grad_grad",
"reshape2_grad",
"gru_grad",
"hierarchical_sigmoid_grad",
"nce_grad",
"roi_perspective_transform_grad",
"sequence_conv_grad",
"gru_unit_grad",
"affine_grid_grad",
"fill_any_like",
"precision_recall",
"unsqueeze_grad",
"kldiv_loss_grad",
"cvm_grad",
"stack_grad",
"warpctc_grad",
"sync_batch_norm",
"match_matrix_tensor_grad",
"ngraph_engine"};
namespace paddle {
namespace framework {
std::unordered_set<std::string> *GetThreadLocalUsedVarNameSet() {
thread_local std::unordered_set<std::string> used_var_name_set;
return &used_var_name_set;
}
void LogVarUsageIfUnusedVarCheckEnabled(const std::string &name) {
if (FLAGS_enable_unused_var_check) {
VLOG(6) << "Variable used:" << name;
GetThreadLocalUsedVarNameSet()->insert(name);
}
}
void CheckUnusedVar(const OperatorBase &op, const Scope &scope) {
// Skip ops in the white list; they should be fixed in the future.
if (op_has_unsed_vars_white_list.count(op.Type()) != 0) {
return;
}
auto *used_set = GetThreadLocalUsedVarNameSet();
std::vector<std::string> unsed_input_var_names;
auto &inferer = op.Info().NoNeedBufferVarsInferer();
std::unordered_set<std::string> no_need_buffer_ins = {};
if (inferer) {
no_need_buffer_ins = inferer(op.Inputs(), op.Outputs(), op.Attrs());
}
for (auto &pair : op.Inputs()) {
// Skip inputs declared as no-need-buffer vars.
if (no_need_buffer_ins.count(pair.first) != 0) {
VLOG(6) << op.Type() << " " << pair.first;
continue;
}
if (used_set->count(pair.first) == 0) {
for (auto &in_var_name : pair.second) {
auto *in_var = scope.FindVar(in_var_name);
if (in_var != nullptr && in_var->IsInitialized()) {
auto *tensor = &in_var->Get<LoDTensor>();
if (tensor != nullptr && tensor->IsInitialized()) {
unsed_input_var_names.emplace_back(pair.first);
break;
}
}
}
}
}
if (!unsed_input_var_names.empty()) {
std::string err_msg = "Operator " + op.Type() + " has input(s) not used: ";
for (auto &in_var_name : unsed_input_var_names) {
err_msg += in_var_name;
err_msg += ", ";
}
err_msg +=
"please make sure they are needed. If not, remove them "
"from the inputs of the operator; if they are needed, register "
"NoNeedBufferVarsInference or add the operator to the "
"white list in unused_var_check.cc. See more details at "
"[https://github.com/PaddlePaddle/Paddle/wiki/"
"OP-Should-Not-Have-Unused-Input]";
PADDLE_ENFORCE_EQ(unsed_input_var_names.size(), 0,
platform::errors::PermissionDenied(
"Unused input variables check failed: %s", err_msg));
}
}
} // namespace framework
} // namespace paddle
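As the error message above suggests, the preferred fix for an input whose buffer is genuinely not needed is to register NoNeedBufferVarsInference rather than extending the white list. Below is a minimal sketch following the flatten_grad pattern that appears later in this diff; MyOpGrad/my_op_grad are hypothetical names used only for illustration.

```cpp
// Inside namespace paddle::operators (as done for FlattenGrad below):
// declare that my_op_grad does not need the buffer of input "X".
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MyOpGradNoNeedBufferVarsInference, "X");

// At registration time, pass the inferer so the unused-input check skips "X".
REGISTER_OPERATOR(my_op_grad, ops::MyOpGradOp,
                  ops::MyOpGradNoNeedBufferVarsInference);
```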
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace framework {
std::unordered_set<std::string>* GetThreadLocalUsedVarNameSet();
void LogVarUsageIfUnusedVarCheckEnabled(const std::string& name);
void CheckUnusedVar(const OperatorBase& op, const Scope& scope);
} // namespace framework
} // namespace paddle
......@@ -633,37 +633,31 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
};
template <typename T>
class BatchNormGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
std::unique_ptr<T> Apply() const override {
auto *op = new T();
op->SetType(this->ForwardOpType() + "_grad");
op->SetInput("X", this->Input("X"));
op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
op->SetInput("Scale", this->Input("Scale"));
op->SetInput("Bias", this->Input("Bias"));
op->SetInput("SavedMean", this->Output("SavedMean"));
op->SetInput("SavedVariance", this->Output("SavedVariance"));
// used when setting use_global_stats True during training
if (boost::get<bool>(this->GetAttr("use_global_stats"))) {
op->SetInput("Mean", this->Output("MeanOut"));
op->SetInput("Variance", this->Output("VarianceOut"));
}
std::unique_ptr<T> BatchNormGradMaker<T>::Apply() const {
auto *op = new T();
op->SetType(this->ForwardOpType() + "_grad");
op->SetInput("X", this->Input("X"));
op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
op->SetInput("Scale", this->Input("Scale"));
op->SetInput("Bias", this->Input("Bias"));
op->SetInput("SavedMean", this->Output("SavedMean"));
op->SetInput("SavedVariance", this->Output("SavedVariance"));
// used when setting use_global_stats True during training
if (boost::get<bool>(this->GetAttr("use_global_stats"))) {
op->SetInput("Mean", this->Output("MeanOut"));
op->SetInput("Variance", this->Output("VarianceOut"));
}
op->SetAttrMap(this->Attrs());
op->SetAttrMap(this->Attrs());
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
return std::unique_ptr<T>(op);
}
};
return std::unique_ptr<T>(op);
}
} // namespace operators
} // namespace paddle
......
......@@ -64,6 +64,15 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() override;
};
template <typename T>
class BatchNormGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
std::unique_ptr<T> Apply() const override;
};
class BatchNormOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
......
......@@ -131,9 +131,25 @@ class FlattenGradOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context());
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context());
}
};
template <typename T>
class FlattenGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
std::unique_ptr<T> Apply() const override {
auto *grad_op = new T();
grad_op->SetType("flatten_grad");
grad_op->SetInput("X", this->Input("X"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
grad_op->SetAttrMap(this->Attrs());
return std::unique_ptr<T>(grad_op);
}
};
......@@ -234,18 +250,20 @@ DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInToOut, {"X", "Out"});
DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceinToOut,
{framework::GradVarName("Out"),
framework::GradVarName("X")});
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(FlattenGradNoNeedBufferVarsInference,
"X");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
flatten, ops::FlattenOp, ops::FlattenOpMaker,
paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>,
ops::FlattenOpInplaceInToOut);
REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
ops::FlattenGradOpMaker<paddle::framework::OpDesc>,
ops::FlattenGradOpMaker<paddle::imperative::OpBase>,
ops::FlattenOpInplaceInToOut);
REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp,
ops::FlattenGradInplaceinToOut);
ops::FlattenGradInplaceinToOut,
ops::FlattenGradNoNeedBufferVarsInference);
REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
ops::Flatten2GradOpMaker<paddle::framework::OpDesc>,
......
......@@ -87,7 +87,6 @@ class InstanceNormGradMaker : public framework::SingleGradOpMaker<T> {
op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
op->SetInput("Scale", this->Input("Scale"));
op->SetInput("Bias", this->Input("Bias"));
op->SetInput("SavedMean", this->Output("SavedMean"));
op->SetInput("SavedVariance", this->Output("SavedVariance"));
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/lrn_op.h"
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/operators/math/blas.h"
......@@ -348,14 +349,31 @@ class LRNOpGrad : public framework::OperatorWithKernel {
layout_, library_);
}
};
template <typename T>
class LRNGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
std::unique_ptr<T> Apply() const override {
auto* op = new T();
op->SetType(this->ForwardOpType() + "_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("Out", this->Output("Out"));
op->SetInput("MidOut", this->Output("MidOut"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetAttrMap(this->Attrs());
return std::unique_ptr<T>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
lrn, ops::LRNOp, ops::LRNOpMaker<float>,
paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
REGISTER_OPERATOR(lrn, ops::LRNOp, ops::LRNOpMaker<float>,
ops::LRNGradOpMaker<paddle::framework::OpDesc>,
ops::LRNGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(lrn_grad, ops::LRNOpGrad);
REGISTER_OP_CPU_KERNEL(
......
......@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/prelu_op.h"
#include <memory>
#include <string>
namespace paddle {
......@@ -130,15 +131,34 @@ class PReluGradOp : public framework::OperatorWithKernel {
}
};
template <typename T>
class PReluGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
std::unique_ptr<T> Apply() const override {
std::unique_ptr<T> op(new T());
op->SetType("prelu_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("Alpha", this->Input("Alpha"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("Alpha"), this->InputGrad("Alpha"));
op->SetAttrMap(this->Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
prelu, ops::PReluOp, ops::PReluOpMaker,
paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
REGISTER_OPERATOR(prelu, ops::PReluOp, ops::PReluOpMaker,
ops::PReluGradOpMaker<paddle::framework::OpDesc>,
ops::PReluGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(prelu_grad, ops::PReluGradOp);
REGISTER_OP_CPU_KERNEL(
prelu, ops::PReluKernel<paddle::platform::CPUDeviceContext, float>);
......
......@@ -70,11 +70,11 @@ class CUDAPReluKernel : public framework::OpKernel<T> {
enum PRELU_MODE { Element, Channel, Scalar };
template <typename T>
__global__ void PReluOpGradKernel(const T* x_ptr, const T* y_ptr,
const T* alpha_ptr, const T* dy_ptr,
T* dx_ptr, T* dalpha_ptr, size_t channel_num,
size_t plane_size, size_t spatial_size,
size_t numel, PRELU_MODE mode) {
__global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr,
const T* dy_ptr, T* dx_ptr, T* dalpha_ptr,
size_t channel_num, size_t plane_size,
size_t spatial_size, size_t numel,
PRELU_MODE mode) {
size_t index;
CUDA_KERNEL_LOOP(index, numel) {
T scale;
......@@ -98,15 +98,15 @@ __global__ void PReluOpGradKernel(const T* x_ptr, const T* y_ptr,
template <typename T>
class PreluOpGradFunctor {
public:
void operator()(cudaStream_t stream, const T* x, const T* y, const T* alpha,
const T* dy, T* dx, T* dalpha, std::vector<int> input_shape,
void operator()(cudaStream_t stream, const T* x, const T* alpha, const T* dy,
T* dx, T* dalpha, std::vector<int> input_shape,
PRELU_MODE mode) {
size_t plane_size = input_shape[2] * input_shape[3];
size_t spatial_size = plane_size * input_shape[1];
size_t numel = spatial_size * input_shape[0];
PReluOpGradKernel<
T><<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
x, y, alpha, dy, dx, dalpha, input_shape[1], plane_size, spatial_size,
x, alpha, dy, dx, dalpha, input_shape[1], plane_size, spatial_size,
numel, mode);
}
};
......@@ -121,14 +121,12 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* y = context.Input<Tensor>("Out");
auto* alpha = context.Input<Tensor>("Alpha");
auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
auto* dy = context.Input<Tensor>(framework::GradVarName("Out"));
auto* dalpha = context.Output<Tensor>(framework::GradVarName("Alpha"));
const T* x_ptr = x->data<T>();
const T* y_ptr = y->data<T>();
const T* alpha_ptr = alpha->data<T>();
const T* dy_ptr = dy->data<T>();
T* dx_ptr = dx ? dx->mutable_data<T>(context.GetPlace()) : nullptr;
......@@ -163,7 +161,7 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
m = Scalar;
}
PreluOpGradFunctor<T> prelu_grad;
prelu_grad(stream, x_ptr, y_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr,
prelu_grad(stream, x_ptr, alpha_ptr, dy_ptr, dx_ptr, dalpha_tmp_ptr,
input_shape, m);
if (dalpha_tmp_ptr == nullptr) return;
......
......@@ -123,9 +123,9 @@ class SqueezeGradOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context());
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context());
}
};
......@@ -214,6 +214,22 @@ class Squeeze2Op : public framework::OperatorWithKernel {
}
};
template <typename T>
class SqueezeGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
std::unique_ptr<T> Apply() const override {
auto *grad_op = new T();
grad_op->SetType("squeeze_grad");
grad_op->SetInput("X", this->Input("X"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
grad_op->SetAttrMap(this->Attrs());
return std::unique_ptr<T>(grad_op);
}
};
class Squeeze2GradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
......@@ -274,16 +290,17 @@ DECLARE_INPLACE_OP_INFERER(SequeezeInplaceInferer, {"X", "Out"});
DECLARE_INPLACE_OP_INFERER(SequeezeGradInplaceInferer,
{framework::GradVarName("Out"),
framework::GradVarName("X")});
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SqueezeGradNoNeedBufferVarsInference,
"X");
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
paddle::framework::DefaultGradOpMaker<paddle::framework::OpDesc, true>,
paddle::framework::DefaultGradOpMaker<paddle::imperative::OpBase, true>);
REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp);
REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
ops::SqueezeGradOpMaker<paddle::framework::OpDesc>,
ops::SqueezeGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp,
ops::SqueezeGradNoNeedBufferVarsInference);
REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
ops::Squeeze2GradOpMaker<paddle::framework::OpDesc>,
......
......@@ -439,9 +439,7 @@ EOF
pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
fi
# TODO: jiabin needs to refine this part when these tests are fixed on mac
ctest --output-on-failure -j $2
paddle version
fi
}
......@@ -709,6 +707,13 @@ function parallel_test() {
parallel_test_base
}
function enable_unused_var_check() {
# NOTE(zhiqiu): Set FLAGS_enable_unused_var_check=1 here to enable unused_var_check,
# which checks if an operator has unused input variable(s).
# Currently, it is used in the coverage CI job.
export FLAGS_enable_unused_var_check=1
}
function gen_doc_lib() {
mkdir -p ${PADDLE_ROOT}/build
cd ${PADDLE_ROOT}/build
......@@ -1079,6 +1084,7 @@ function main() {
cicheck)
cmake_gen ${PYTHON_ABI:-""}
build ${parallel_number}
enable_unused_var_check
parallel_test
;;
cicheck_brpc)
......
......@@ -168,7 +168,8 @@ def __bootstrap__():
'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism',
'enable_parallel_graph', 'fuse_parameter_groups_size',
'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size',
'tracer_profile_fname', 'dygraph_debug', 'use_system_allocator'
'tracer_profile_fname', 'dygraph_debug', 'use_system_allocator',
'enable_unused_var_check'
]
if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory')
......
......@@ -25,7 +25,8 @@ API_FILES=("CMakeLists.txt"
"python/paddle/fluid/parallel_executor.py"
"python/paddle/fluid/framework.py"
"python/paddle/fluid/backward.py"
"paddle/fluid/operators/distributed/send_recv.proto.in")
"paddle/fluid/operators/distributed/send_recv.proto.in"
"paddle/fluid/framework/unused_var_check.cc")
approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`
git_files=`git diff --numstat upstream/$BRANCH| wc -l`
......@@ -73,7 +74,7 @@ for API_FILE in ${API_FILES[*]}; do
echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
# approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,JiabinYang 22361972,chenwhql 22561442,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059,Boyan-Liu 2870059.
# approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,JiabinYang 22361972,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059,Boyan-Liu 2870059.
if [ "${API_FILE}" == "paddle/fluid/op_use_default_grad_op_maker.spec" ];then
echo_line="You must have one RD (sneaxiy (Recommend) or luotao1) approval for op_use_default_grad_op_maker.spec, which manages the grad_op memory optimization.\n"
check_approval 1 32832641 6836917
......@@ -89,6 +90,9 @@ for API_FILE in ${API_FILES[*]}; do
elif [ "${API_FILE}" == "paddle/fluid/operators/distributed/send_recv.proto.in" ];then
echo_line="You must have one RD (gongweibao or seiriosPlus) approval for the paddle/fluid/operators/distributed/send_recv.proto.in, which manages the environment variables.\n"
check_approval 1 10721757 5442383
elif [ "${API_FILE}" == "paddle/fluid/framework/unused_var_check.cc" ];then
echo_line="You must have one RD (zhiqiu (Recommend) , sneaxiy or luotao1) approval for the paddle/fluid/framework/unused_var_check.cc, which manages the white list of operators that have unused input variables. Before change the white list, please read the spicification [https://github.com/PaddlePaddle/Paddle/wiki/OP-Should-Not-Have-Unused-Input] and try to refine code first. \n"
check_approval 1 6888866 32832641 6836917
else
echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1,sneaxiy) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
check_approval 1 3048612 46782768 12538138 6836917 32832641
......