diff --git a/paddle/fluid/extension/include/op_meta_info.h b/paddle/fluid/extension/include/op_meta_info.h index 1bc044f647fbae0c4666ecda9e2a2fc3dc8ef214..9c8d9fa40f13d75af2eea970d18efc1132b3f210 100644 --- a/paddle/fluid/extension/include/op_meta_info.h +++ b/paddle/fluid/extension/include/op_meta_info.h @@ -38,6 +38,8 @@ class PD_DLL_DECL OpMetaInfoHelper; using Tensor = paddle::Tensor; +///////////////// Util Marco Define //////////////// + #define PD_DISABLE_COPY_AND_ASSIGN(classname) \ private: \ classname(const classname&) = delete; \ @@ -65,6 +67,12 @@ using Tensor = paddle::Tensor; END_HANDLE_THE_ERROR \ } while (0) +#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + ///////////////// Util Define and Function //////////////// inline std::string Grad(const std::string& var_name) { @@ -288,9 +296,9 @@ class PD_DLL_DECL OpMetaInfo { std::vector attrs_; // 2. 
func info - KernelFunc kernel_fn_; - InferShapeFunc infer_shape_fn_; - InferDtypeFunc infer_dtype_fn_; + KernelFunc kernel_fn_{nullptr}; + InferShapeFunc infer_shape_fn_{nullptr}; + InferDtypeFunc infer_dtype_fn_{nullptr}; }; //////////////// Op Meta Info Map ///////////////// @@ -321,20 +329,22 @@ class PD_DLL_DECL OpMetaInfoMap { class PD_DLL_DECL OpMetaInfoBuilder { public: - explicit OpMetaInfoBuilder(std::string&& name); + explicit OpMetaInfoBuilder(std::string&& name, size_t index); OpMetaInfoBuilder& Inputs(std::vector&& inputs); OpMetaInfoBuilder& Outputs(std::vector&& outputs); OpMetaInfoBuilder& Attrs(std::vector&& attrs); OpMetaInfoBuilder& SetKernelFn(KernelFunc func); OpMetaInfoBuilder& SetInferShapeFn(InferShapeFunc func); OpMetaInfoBuilder& SetInferDtypeFn(InferDtypeFunc func); - OpMetaInfoBuilder& SetBackwardOp(const std::string& bwd_op_name); private: // Forward Op name std::string name_; - // Point to the currently constructed op meta info + // ref current info ptr OpMetaInfo* info_ptr_; + // The current op meta info index in vector + // - 0: op, 1: grad_op, 2: grad_grad_op + size_t index_; }; /////////////////////// Op register API ///////////////////////// @@ -350,14 +360,25 @@ void LoadCustomOperatorLib(const std::string& dso_name); /////////////////////// Op register Macro ///////////////////////// -#define PD_BUILD_OP_WITH_COUNTER(op_name, counter) \ - static ::paddle::OpMetaInfoBuilder __op_meta_info_##counter##__ = \ - ::paddle::OpMetaInfoBuilder(op_name) - -#define PD_BUILD_OP_INNER(op_name, counter) \ - PD_BUILD_OP_WITH_COUNTER(op_name, counter) - -#define PD_BUILD_OP(op_name) PD_BUILD_OP_INNER(op_name, __COUNTER__) +#define PD_BUILD_OP(op_name) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op__##op_name, "PD_BUILD_OP must be called in global namespace."); \ + static ::paddle::OpMetaInfoBuilder __op_meta_info_##op_name##__ = \ + ::paddle::OpMetaInfoBuilder(#op_name, 0) + +#define PD_BUILD_GRAD_OP(op_name) \ + 
STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_grad_op__##op_name, \ + "PD_BUILD_GRAD_OP must be called in global namespace."); \ + static ::paddle::OpMetaInfoBuilder __grad_op_meta_info_##op_name##__ = \ + ::paddle::OpMetaInfoBuilder(#op_name, 1) + +#define PD_BUILD_DOUBLE_GRAD_OP(op_name) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_grad_grad_op__##op_name, \ + "PD_BUILD_DOUBLE_GRAD_OP must be called in global namespace."); \ + static ::paddle::OpMetaInfoBuilder __grad_grad_op_meta_info_##op_name##__ = \ + ::paddle::OpMetaInfoBuilder(#op_name, 2) } // namespace paddle diff --git a/paddle/fluid/extension/src/op_meta_info.cc b/paddle/fluid/extension/src/op_meta_info.cc index d362282b8d9d24c287e51643d3aca72d9fd36c50..20129435f26b1423d046a5ffea07b1c1c2b226af 100644 --- a/paddle/fluid/extension/src/op_meta_info.cc +++ b/paddle/fluid/extension/src/op_meta_info.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -62,11 +63,38 @@ OpMetaInfoMap::GetMap() const { //////////////// Op Meta Info Builder ///////////////// -OpMetaInfoBuilder::OpMetaInfoBuilder(std::string&& name) { +OpMetaInfoBuilder::OpMetaInfoBuilder(std::string&& name, size_t index) { + // 1. member assign name_ = std::forward(name); + index_ = index; + + // 2. check and meta info build auto& info_vector = OpMetaInfoMap::Instance()[name_]; + // index check + PADDLE_ENFORCE_EQ( + info_vector.size(), index_, + platform::errors::PreconditionNotMet( + "The operator %s's meta info register failed. 
" + "Please make sure you call marcos as order `PD_BUILD_OP`, " + "`PD_BUILD_GRAD_OP`, `PD_BUILD_DOUBLE_GRAD_OP`.", + name_)); + switch (index_) { + case 0: + break; + case 1: + name_ = name_ + "_grad"; + break; + case 2: + name_ = name_ + "_grad_grad"; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support index `%d` when construct OpMetaInfoBuilder, " + "now only support `0, 1, 2`.", + index_)); + } auto op_meta = OpMetaInfo(name_); info_vector.emplace_back(std::move(op_meta)); + // 3. get current info ptr info_ptr_ = &(info_vector.back()); } @@ -93,24 +121,27 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc func) { } OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc func) { + PADDLE_ENFORCE_EQ( + index_, 0UL, + platform::errors::Unimplemented( + "Currently, the InferShapeFn setting of Grad Op is not supported, " + "And backward Tensor `X@GRAD` will use the shape of forward Tensor " + "`X` by default.")); info_ptr_->SetInferShapeFn(std::forward(func)); return *this; } OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc func) { + PADDLE_ENFORCE_EQ( + index_, 0UL, + platform::errors::Unimplemented( + "Currently, the InferDtypeFn setting of Grad Op is not supported, " + "And backward Tensor `X@GRAD` will use the dtype of forward Tensor " + "`X` by default.")); info_ptr_->SetInferDtypeFn(std::forward(func)); return *this; } -OpMetaInfoBuilder& OpMetaInfoBuilder::SetBackwardOp( - const std::string& bwd_op_name) { - auto& info_vector = OpMetaInfoMap::Instance()[name_]; - auto op_meta = OpMetaInfo(bwd_op_name); - info_vector.emplace_back(std::move(op_meta)); - info_ptr_ = &(info_vector.back()); - return *this; -} - /////////////////////// Op register API ///////////////////////// void RegisterAllCustomOperator() { diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 03a8cc366e7f2e8bb3baa2dd65ee609533cb8137..90831afc9ba89185dbe85dbf54bb38ea3ffbace6 
100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -153,12 +153,21 @@
   }
 
   VLOG(1) << "Run ComputeFunc.";
-  auto outs = func(custom_ins, custom_attrs);
+  try {
+    auto outs = func(custom_ins, custom_attrs);
 
-  VLOG(1) << "Custom Operator: Share outputs into ExecutionContext.";
-  for (size_t i = 0; i < outputs.size(); ++i) {
-    auto* true_out = ctx.Output<Tensor>(outputs[i]);
-    CustomTensorUtils::ShareDataTo(outs.at(i), true_out);
+    VLOG(1) << "Custom Operator: Share outputs into ExecutionContext.";
+    for (size_t i = 0; i < outputs.size(); ++i) {
+      auto* true_out = ctx.Output<Tensor>(outputs[i]);
+      CustomTensorUtils::ShareDataTo(outs.at(i), true_out);
+    }
+  } catch (platform::EnforceNotMet& exception) {
+    throw std::move(exception);
+  } catch (std::exception& ex) {
+    PADDLE_THROW(platform::errors::External("%s", ex.what()));
+  } catch (...) {
+    PADDLE_THROW(platform::errors::Fatal(
+        "Custom operator raises an unknown exception in runtime."));
   }
 }
 
@@ -475,58 +484,108 @@
       op_name, info.proto_->InitializationErrorString()));
 
   // InferShape
-  PADDLE_ENFORCE_NOT_NULL(
-      infer_shape_func,
-      platform::errors::PreconditionNotMet(
-          "InferShapeFn is nullptr. 
Need to set the InferShapeFn of custom " - "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); - info.infer_shape_ = [op_inputs, op_outputs, - infer_shape_func](InferShapeContext* ctx) { - std::vector> input_shapes; - - VLOG(1) << "Custom Operator: InferShape - get input ddim."; - for (auto& in_name : op_inputs) { - OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); - auto ddim = ctx->GetInputDim(in_name); - input_shapes.emplace_back(framework::vectorize(ddim)); - } + if (infer_shape_func == nullptr) { + // use default InferShape + info.infer_shape_ = [op_inputs, op_outputs](InferShapeContext* ctx) { + PADDLE_ENFORCE_EQ( + op_inputs.size(), 1UL, + platform::errors::Unavailable( + "Your custom operator contains multiple inputs. " + "We only allow a custom operator that contains only one input " + "and " + "only one output without setting the InferShapeFn. At this time, " + "the input shape will be directly set to the output shape.\n" + "Please set the InferShapeFn of custom " + "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); + PADDLE_ENFORCE_EQ( + op_outputs.size(), 1UL, + platform::errors::Unavailable( + "Your custom operator contains multiple outputs. " + "We only allow a custom operator that contains only one input " + "and " + "only one output without setting the InferShapeFn. 
At this time, " + "the input shape will be directly set to the output shape.\n" + "Please set the InferShapeFn of custom " + "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); + + VLOG(1) << "Custom Operator: Default InferShape - share ddim."; + ctx->ShareDim(op_inputs[0], op_outputs[0]); + }; + } else { + info.infer_shape_ = [op_inputs, op_outputs, + infer_shape_func](InferShapeContext* ctx) { + std::vector> input_shapes; + + VLOG(1) << "Custom Operator: InferShape - get input ddim."; + for (auto& in_name : op_inputs) { + OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); + auto ddim = ctx->GetInputDim(in_name); + input_shapes.emplace_back(framework::vectorize(ddim)); + } - VLOG(1) << "Custom Operator: InferShape - calc output ddim."; - auto output_shapes = infer_shape_func(input_shapes); + VLOG(1) << "Custom Operator: InferShape - calc output ddim."; + auto output_shapes = infer_shape_func(input_shapes); - VLOG(1) << "Custom Operator: InferShape - set output ddim."; - for (size_t i = 0; i < op_outputs.size(); ++i) { - ctx->SetOutputDim(op_outputs[i], framework::make_ddim(output_shapes[i])); - } - }; + VLOG(1) << "Custom Operator: InferShape - set output ddim."; + for (size_t i = 0; i < op_outputs.size(); ++i) { + ctx->SetOutputDim(op_outputs[i], + framework::make_ddim(output_shapes[i])); + } + }; + } // Infer Dtype - PADDLE_ENFORCE_NOT_NULL( - infer_dtype_func, - platform::errors::PreconditionNotMet( - "InferDtypeFn is nullptr. 
Need to set the InferDtypeFn of custom " - "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))")); - info.infer_var_type_ = [op_inputs, op_outputs, - infer_dtype_func](InferVarTypeContext* ctx) { - std::vector input_dtypes; - - VLOG(1) << "Custom Operator: InferDtype - get input dtype."; - for (auto& in_name : op_inputs) { - auto dtype = ctx->GetInputDataType(in_name); - input_dtypes.emplace_back( - CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype)); - } + if (infer_dtype_func == nullptr) { + // use defalut InferDtype + info.infer_var_type_ = [op_inputs, op_outputs](InferVarTypeContext* ctx) { + PADDLE_ENFORCE_EQ( + op_inputs.size(), 1UL, + platform::errors::Unavailable( + "Your custom operator contains multiple inputs. " + "We only allow a custom operator that contains only one input " + "and " + "only one output without setting the InferDtypeFn. At this time, " + "the input dtype will be directly set to the output dtype.\n" + "Please set the InferDtypeFn of custom " + "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))")); + PADDLE_ENFORCE_EQ( + op_outputs.size(), 1UL, + platform::errors::Unavailable( + "Your custom operator contains multiple outputs. " + "We only allow a custom operator that contains only one input " + "and " + "only one output without setting the InferDtypeFn. 
At this time, " + "the input dtype will be directly set to the output dtype.\n" + "Please set the InferDtypeFn of custom " + "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))")); + + VLOG(1) << "Custom Operator: InferDtype - share dtype."; + auto dtype = ctx->GetInputDataType(op_inputs[0]); + ctx->SetOutputDataType(op_outputs[0], dtype); + }; + } else { + info.infer_var_type_ = [op_inputs, op_outputs, + infer_dtype_func](InferVarTypeContext* ctx) { + std::vector input_dtypes; + + VLOG(1) << "Custom Operator: InferDtype - get input dtype."; + for (auto& in_name : op_inputs) { + auto dtype = ctx->GetInputDataType(in_name); + input_dtypes.emplace_back( + CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype)); + } - VLOG(1) << "Custom Operator: InferDtype - infer output dtype."; - auto output_dtypes = infer_dtype_func(input_dtypes); + VLOG(1) << "Custom Operator: InferDtype - infer output dtype."; + auto output_dtypes = infer_dtype_func(input_dtypes); - VLOG(1) << "Custom Operator: InferDtype - set output dtype."; - for (size_t i = 0; i < op_outputs.size(); ++i) { - ctx->SetOutputDataType( - op_outputs[i], - CustomTensorUtils::ConvertEnumDTypeToInnerDType(output_dtypes[i])); - } - }; + VLOG(1) << "Custom Operator: InferDtype - set output dtype."; + for (size_t i = 0; i < op_outputs.size(); ++i) { + ctx->SetOutputDataType( + op_outputs[i], + CustomTensorUtils::ConvertEnumDTypeToInnerDType(output_dtypes[i])); + } + }; + } // Kernel func RegisterOperatorKernel(op_name, kernel_fn, op_inputs, op_outputs, op_attrs); diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index 3f85f4ef50a223949ef60678b61e97be29aea471..7f94da43535589d75a0e2312c351c13c16f27146 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -3,10 +3,12 @@ if(WITH_GPU) # 'test_custom_relu_op_setup/jit' compile .cc and .cu file py_test(test_custom_relu_op_setup SRCS 
test_custom_relu_op_setup.py) py_test(test_custom_relu_op_jit SRCS test_custom_relu_op_jit.py) + py_test(test_custom_relu_model SRCS test_custom_relu_model.py) # Compiling shared library will cost some time, but running process is very fast. set_tests_properties(test_custom_relu_op_setup PROPERTIES TIMEOUT 250) set_tests_properties(test_custom_relu_op_jit PROPERTIES TIMEOUT 180) + set_tests_properties(test_custom_relu_model PROPERTIES TIMEOUT 180) endif() py_test(test_sysconfig SRCS test_sysconfig.py) diff --git a/python/paddle/fluid/tests/custom_op/attr_test_op.cc b/python/paddle/fluid/tests/custom_op/attr_test_op.cc index 474d3d2d4e2b3b566620a11d41564fb662bd35e3..97aae10613734948e65a76b6854bfffe9bed45a7 100644 --- a/python/paddle/fluid/tests/custom_op/attr_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/attr_test_op.cc @@ -150,15 +150,7 @@ std::vector AttrTestBackward( return {grad_x}; } -std::vector> InferShape(std::vector x_shape) { - return {x_shape}; -} - -std::vector InferDType(paddle::DataType x_dtype) { - return {x_dtype}; -} - -PD_BUILD_OP("attr_test") +PD_BUILD_OP(attr_test) .Inputs({"X"}) .Outputs({"Out"}) .Attrs({"bool_attr: bool", @@ -170,10 +162,9 @@ PD_BUILD_OP("attr_test") "float_vec_attr: std::vector", "int64_vec_attr: std::vector", "str_vec_attr: std::vector"}) - .SetKernelFn(PD_KERNEL(AttrTestForward)) - .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(InferDType)) - .SetBackwardOp("attr_test_grad") + .SetKernelFn(PD_KERNEL(AttrTestForward)); + +PD_BUILD_GRAD_OP(attr_test) .Inputs({paddle::Grad("Out")}) .Outputs({paddle::Grad("X")}) .Attrs({"int_attr: int", diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index 0e358e24ae3e814b3fd21d010c478812aa0b8340..4b8d3bca63695c0abfb5dafc0423e292fd157c07 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -96,21 +96,12 @@ 
std::vector ReluBackward(const paddle::Tensor& x, } } -std::vector> ReluInferShape(std::vector x_shape) { - return {x_shape}; -} - -std::vector ReluInferDType(paddle::DataType x_dtype) { - return {x_dtype}; -} - -PD_BUILD_OP("custom_relu") +PD_BUILD_OP(custom_relu) .Inputs({"X"}) .Outputs({"Out"}) - .SetKernelFn(PD_KERNEL(ReluForward)) - .SetInferShapeFn(PD_INFER_SHAPE(ReluInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(ReluInferDType)) - .SetBackwardOp("relu2_grad") + .SetKernelFn(PD_KERNEL(ReluForward)); + +PD_BUILD_GRAD_OP(custom_relu) .Inputs({"X", "Out", paddle::Grad("Out")}) .Outputs({paddle::Grad("X")}) .SetKernelFn(PD_KERNEL(ReluBackward)); diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op_dup.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op_dup.cc index 7319bdd76264508ef485ba80382aac6dcbbeb4b6..89d14bfa049603dd7a97b6d374dfa44c227728e2 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op_dup.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op_dup.cc @@ -25,19 +25,14 @@ std::vector ReluBackward(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out); -std::vector> ReluInferShape(std::vector x_shape); - -std::vector ReluInferDType(paddle::DataType x_dtype); - // Reuse codes in `custom_relu_op.cc/cu` to register another custom operator // to test jointly compile multi operators at same time. 
-PD_BUILD_OP("custom_relu_dup") +PD_BUILD_OP(custom_relu_dup) .Inputs({"X"}) .Outputs({"Out"}) - .SetKernelFn(PD_KERNEL(ReluForward)) - .SetInferShapeFn(PD_INFER_SHAPE(ReluInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(ReluInferDType)) - .SetBackwardOp("relu3_grad") + .SetKernelFn(PD_KERNEL(ReluForward)); + +PD_BUILD_GRAD_OP(custom_relu_dup) .Inputs({"X", "Out", paddle::Grad("Out")}) .Outputs({paddle::Grad("X")}) .SetKernelFn(PD_KERNEL(ReluBackward)); diff --git a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc index e09ac2f87c80639ac92467e7b50094e7752d500f..720be8b4e377b555ba202fb4f81ee3f7be54e265 100644 --- a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc @@ -26,14 +26,6 @@ void assign_cpu_kernel(const data_t* x_data, } } -std::vector> InferShape(std::vector x_shape) { - return {x_shape}; -} - -std::vector InferDType(paddle::DataType x_dtype) { - return {x_dtype}; -} - std::vector DispatchTestInterger(const paddle::Tensor& x) { auto out = paddle::Tensor(paddle::PlaceType::kCPU); out.reshape(x.shape()); @@ -47,12 +39,10 @@ std::vector DispatchTestInterger(const paddle::Tensor& x) { return {out}; } -PD_BUILD_OP("dispatch_test_integer") +PD_BUILD_OP(dispatch_test_integer) .Inputs({"X"}) .Outputs({"Out"}) - .SetKernelFn(PD_KERNEL(DispatchTestInterger)) - .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(InferDType)); + .SetKernelFn(PD_KERNEL(DispatchTestInterger)); std::vector DispatchTestComplex(const paddle::Tensor& x) { auto out = paddle::Tensor(paddle::PlaceType::kCPU); @@ -67,12 +57,10 @@ std::vector DispatchTestComplex(const paddle::Tensor& x) { return {out}; } -PD_BUILD_OP("dispatch_test_complex") +PD_BUILD_OP(dispatch_test_complex) .Inputs({"X"}) .Outputs({"Out"}) - .SetKernelFn(PD_KERNEL(DispatchTestComplex)) - .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) - 
.SetInferDtypeFn(PD_INFER_DTYPE(InferDType)); + .SetKernelFn(PD_KERNEL(DispatchTestComplex)); std::vector DispatchTestFloatAndInteger( const paddle::Tensor& x) { @@ -88,12 +76,10 @@ std::vector DispatchTestFloatAndInteger( return {out}; } -PD_BUILD_OP("dispatch_test_float_and_integer") +PD_BUILD_OP(dispatch_test_float_and_integer) .Inputs({"X"}) .Outputs({"Out"}) - .SetKernelFn(PD_KERNEL(DispatchTestFloatAndInteger)) - .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(InferDType)); + .SetKernelFn(PD_KERNEL(DispatchTestFloatAndInteger)); std::vector DispatchTestFloatAndComplex( const paddle::Tensor& x) { @@ -109,12 +95,10 @@ std::vector DispatchTestFloatAndComplex( return {out}; } -PD_BUILD_OP("dispatch_test_float_and_complex") +PD_BUILD_OP(dispatch_test_float_and_complex) .Inputs({"X"}) .Outputs({"Out"}) - .SetKernelFn(PD_KERNEL(DispatchTestFloatAndComplex)) - .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(InferDType)); + .SetKernelFn(PD_KERNEL(DispatchTestFloatAndComplex)); std::vector DispatchTestFloatAndIntegerAndComplex( const paddle::Tensor& x) { @@ -130,9 +114,7 @@ std::vector DispatchTestFloatAndIntegerAndComplex( return {out}; } -PD_BUILD_OP("dispatch_test_float_and_integer_and_complex") +PD_BUILD_OP(dispatch_test_float_and_integer_and_complex) .Inputs({"X"}) .Outputs({"Out"}) - .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex)) - .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(InferDType)); + .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex)); diff --git a/python/paddle/fluid/tests/custom_op/multi_out_test_op.cc b/python/paddle/fluid/tests/custom_op/multi_out_test_op.cc index bece0f49845a5ae3fd006ccf383adb78f043bd4b..17a36df2cde48598efb3945499516c6c70edecfd 100644 --- a/python/paddle/fluid/tests/custom_op/multi_out_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/multi_out_test_op.cc @@ -68,7 +68,7 @@ std::vector 
InferDtype(paddle::DataType x_dtype) { return {x_dtype, paddle::DataType::FLOAT64, paddle::DataType::INT32}; } -PD_BUILD_OP("multi_out") +PD_BUILD_OP(multi_out) .Inputs({"X"}) .Outputs({"Out", "Fake_float64", "ZFake_int32"}) .SetKernelFn(PD_KERNEL(MultiOutCPU)) diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py new file mode 100644 index 0000000000000000000000000000000000000000..205204168859ad39b59fcaca6524f98a3d09330c --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py @@ -0,0 +1,318 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np + +import paddle +from paddle import nn +from paddle.utils.cpp_extension import load, get_build_directory +from paddle.utils.cpp_extension.extension_utils import run_cmd + +from utils import paddle_includes, extra_compile_args + +# Because Windows don't use docker, the shared lib already exists in the +# cache dir, it will not be compiled again unless the shared lib is removed. +file = '{}\\custom_relu_for_model_jit\\custom_relu_for_model_jit.pyd'.format( + get_build_directory()) +if os.name == 'nt' and os.path.isfile(file): + cmd = 'del {}'.format(file) + run_cmd(cmd, True) + +# Compile and load custom op Just-In-Time. 
+# custom_relu_op_dup.cc is only used for multi ops test, +# not a new op, if you want to test only one op, remove this +# source file +custom_module = load( + name='custom_relu_for_model_jit', + sources=['custom_relu_op.cc', 'custom_relu_op.cu'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cxx_cflags=extra_compile_args, # add for Coverage CI + extra_cuda_cflags=extra_compile_args, # add for Coverage CI + verbose=True) + + +class Net(nn.Layer): + """ + A simple exmaple for Regression Model. + """ + + def __init__(self, in_dim, out_dim, use_custom_op=False): + super(Net, self).__init__() + self.fc1 = nn.Linear(in_dim, in_dim) + self.fc2 = nn.Linear(in_dim, out_dim) + self.relu_act = custom_module.custom_relu if use_custom_op else nn.functional.relu + + def forward(self, x): + out = self.fc1(x) + out = self.relu_act(out) + out = self.fc2(out) + out = self.relu_act(out) + + out = paddle.mean(out, axis=-1) + + return out + + +class TestDygraphModel(unittest.TestCase): + def setUp(self): + + self.seed = 2021 + self.in_dim = 10 + self.out_dim = 64 + self.batch_num = 10 + self.batch_size = 4 + self.datas = [ + np.random.uniform( + size=[self.batch_size, self.in_dim]).astype('float32') + for i in range(self.batch_num) + ] + self.labels = [ + np.random.uniform(size=[self.batch_size, 1]).astype('float32') + for i in range(self.batch_num) + ] + + self.devices = ['cpu', 'gpu'] + + # for saving model + self.model_path_template = "infer_model/custom_relu_dygaph_model_{}.pdparams" + self.model_dy2stat_path = "infer_model/custom_relu_model_dy2sta" + + # for dy2stat + self.x_spec = paddle.static.InputSpec( + shape=[None, self.in_dim], dtype='float32', name='x') + + def test_train_eval(self): + for device in self.devices: + # set device + paddle.set_device(device) + + # for train + origin_relu_train_out = self.train_model(use_custom_op=False) + custom_relu_train_out = self.train_model(use_custom_op=True) + custom_relu_dy2stat_train_out = self.train_model( + 
use_custom_op=True, dy2stat=True) # for to_static + + self.assertTrue( + np.array_equal(origin_relu_train_out, custom_relu_train_out)) + self.assertTrue( + np.array_equal(origin_relu_train_out, + custom_relu_dy2stat_train_out)) + + # for eval + origin_relu_eval_out = self.eval_model(use_custom_op=False) + custom_relu_eval_out = self.eval_model(use_custom_op=True) + custom_relu_dy2stat_eval_out = self.eval_model( + use_custom_op=True, dy2stat=True) # for to_static + + self.assertTrue( + np.array_equal(origin_relu_eval_out, custom_relu_eval_out)) + self.assertTrue( + np.array_equal(origin_relu_eval_out, + custom_relu_dy2stat_eval_out)) + + def train_model(self, use_custom_op=False, dy2stat=False): + # reset random seed + paddle.seed(self.seed) + np.random.seed(self.seed) + # paddle.framework.random._manual_program_seed(SEED) + + net = Net(self.in_dim, self.out_dim, use_custom_op) + if dy2stat: + net = paddle.jit.to_static(net, input_spec=[self.x_spec]) + mse_loss = paddle.nn.MSELoss() + sgd = paddle.optimizer.SGD(learning_rate=0.1, + parameters=net.parameters()) + + for batch_id in range(self.batch_num): + x = paddle.to_tensor(self.datas[batch_id]) + y = paddle.to_tensor(self.labels[batch_id]) + + out = net(x) + loss = mse_loss(out, y) + + loss.backward() + sgd.minimize(loss) + net.clear_gradients() + + # save inference model + net.eval() + if dy2stat: + paddle.jit.save(net, self.model_dy2stat_path) + else: + paddle.save(net.state_dict(), + self.model_path_template.format(use_custom_op)) + + return out.numpy() + + def eval_model(self, use_custom_op=False, dy2stat=False): + net = Net(self.in_dim, self.out_dim, use_custom_op) + + if dy2stat: + net = paddle.jit.load(self.model_dy2stat_path) + else: + state_dict = paddle.load( + self.model_path_template.format(use_custom_op)) + net.set_state_dict(state_dict) + + sample_x = paddle.to_tensor(self.datas[0]) + net.eval() + out = net(sample_x) + + return out.numpy() + + +class TestStaticModel(unittest.TestCase): + def 
setUp(self): + self.seed = 2021 + self.in_dim = 10 + self.out_dim = 64 + self.batch_num = 10 + self.batch_size = 8 + self.datas = [ + np.random.uniform( + size=[self.batch_size, self.in_dim]).astype('float32') + for i in range(self.batch_num) + ] + self.labels = [ + np.random.uniform(size=[self.batch_size, 1]).astype('float32') + for i in range(self.batch_num) + ] + + self.devices = ['cpu', 'gpu'] + + # for saving model + self.model_path_template = "infer_model/custom_relu_static_model_{}_{}" + + paddle.enable_static() + + def tearDown(self): + paddle.disable_static() + + def test_train_eval(self): + for device in self.devices: + # for train + original_relu_train_out = self.train_model( + device, use_custom_op=False) + custom_relu_train_out = self.train_model(device, use_custom_op=True) + # using PE + original_relu_train_pe_out = self.train_model( + device, use_custom_op=False, use_pe=True) + custom_relu_train_pe_out = self.train_model( + device, use_custom_op=True, use_pe=True) + print(original_relu_train_out) + print(custom_relu_train_out) + print(original_relu_train_pe_out) + print(custom_relu_train_pe_out) + + self.assertTrue( + np.array_equal(original_relu_train_out, custom_relu_train_out)) + self.assertTrue( + np.array_equal(original_relu_train_pe_out, + custom_relu_train_pe_out)) + + # for eval + original_relu_eval_out = self.eval_model( + device, use_custom_op=False) + custom_relu_eval_out = self.eval_model(device, use_custom_op=True) + # using PE + original_relu_eval_pe_out = self.eval_model( + device, use_custom_op=False, use_pe=True) + custom_relu_eval_pe_out = self.eval_model( + device, use_custom_op=True, use_pe=True) + print(original_relu_eval_out) + print(custom_relu_eval_out) + print(original_relu_eval_pe_out) + print(custom_relu_eval_pe_out) + + self.assertTrue( + np.array_equal(original_relu_eval_out, custom_relu_eval_out)) + self.assertTrue( + np.array_equal(original_relu_eval_pe_out, + custom_relu_eval_pe_out)) + + def train_model(self, device, 
use_custom_op=False, use_pe=False):
+        # reset random seed
+        paddle.seed(self.seed)
+        np.random.seed(self.seed)
+        # set device
+        paddle.set_device(device)
+
+        with paddle.static.scope_guard(paddle.static.Scope()):
+            with paddle.static.program_guard(paddle.static.Program()):
+                x = paddle.static.data(
+                    shape=[None, self.in_dim], name='x', dtype='float32')
+                y = paddle.static.data(
+                    shape=[None, 1], name='y', dtype='float32')
+
+                net = Net(self.in_dim, self.out_dim, use_custom_op)
+                out = net(x)
+
+                loss = nn.functional.mse_loss(out, y)
+                sgd = paddle.optimizer.SGD(learning_rate=0.01)
+                sgd.minimize(loss)
+
+                exe = paddle.static.Executor()
+                exe.run(paddle.static.default_startup_program())
+
+                # For PE
+                if use_pe:
+                    places = paddle.static.cpu_places(
+                    ) if device == 'cpu' else paddle.static.cuda_places()
+                    main_program = paddle.static.CompiledProgram(
+                        paddle.static.default_main_program(
+                        )).with_data_parallel(
+                            loss_name=loss.name, places=places)
+                else:
+                    main_program = paddle.static.default_main_program()
+
+                for batch_id in range(self.batch_num):
+                    x_data = self.datas[batch_id]
+                    y_data = self.labels[batch_id]
+
+                    res = exe.run(main_program,
+                                  feed={'x': x_data,
+                                        'y': y_data},
+                                  fetch_list=[out])
+
+                # save model
+                paddle.static.save_inference_model(
+                    self.model_path_template.format(use_custom_op, use_pe),
+                    [x], [out], exe)
+
+        return res[0]
+
+    def eval_model(self, device, use_custom_op=False, use_pe=False):
+        paddle.set_device(device)
+
+        with paddle.static.scope_guard(paddle.static.Scope()):
+            with paddle.static.program_guard(paddle.static.Program()):
+                exe = paddle.static.Executor()
+
+                [inference_program, feed_target_names,
+                 fetch_targets] = paddle.static.load_inference_model(
+                     self.model_path_template.format(use_custom_op, use_pe),
+                     exe)
+
+                x_data = self.datas[0]
+                results = exe.run(inference_program,
+                                  feed={feed_target_names[0]: x_data},
+                                  fetch_list=fetch_targets)
+
+        return results[0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py
index 57bcea658b53c400234afd22d4d5acc77f7f43ce..5d132217bba91f84924cbff9f2bd951d381326cf 100644
--- a/python/paddle/utils/cpp_extension/cpp_extension.py
+++ b/python/paddle/utils/cpp_extension/cpp_extension.py
@@ -22,11 +22,14 @@ from setuptools.command.easy_install import easy_install
 from setuptools.command.build_ext import build_ext
 from distutils.command.build import build
 
-from .extension_utils import find_cuda_home, normalize_extension_kwargs, add_compile_flag, bootstrap_context
-from .extension_utils import is_cuda_file, prepare_unix_cudaflags, prepare_win_cudaflags, add_std_without_repeat, get_build_directory
-from .extension_utils import _import_module_from_library, CustomOpInfo, _write_setup_file, _jit_compile, parse_op_name_from
-from .extension_utils import check_abi_compatibility, log_v, IS_WINDOWS, OS_NAME
-from .extension_utils import use_new_custom_op_load_method, MSVC_COMPILE_FLAGS
+from .extension_utils import find_cuda_home, normalize_extension_kwargs, add_compile_flag
+from .extension_utils import is_cuda_file, prepare_unix_cudaflags, prepare_win_cudaflags
+from .extension_utils import _import_module_from_library, _write_setup_file, _jit_compile
+from .extension_utils import check_abi_compatibility, log_v, CustomOpInfo, parse_op_name_from
+from .extension_utils import use_new_custom_op_load_method, clean_object_if_change_cflags
+from .extension_utils import bootstrap_context, get_build_directory, add_std_without_repeat
+
+from .extension_utils import IS_WINDOWS, OS_NAME, MSVC_COMPILE_FLAGS
 
 # Note(zhouwei): On windows, it will export function 'PyInit_[name]' by default,
 # The solution is: 1.User add function PyInit_[name] 2. 
set not to export @@ -357,6 +360,13 @@ class BuildExtension(build_ext, object): def build_extensions(self): self._check_abi() + # Note(Aurelius84): If already compiling source before, we should check whether + # cflags have changed and delete the built shared library to re-compile the source + # even though source file content keeps unchanged. + so_name = self.get_ext_fullpath(self.extensions[0].name) + clean_object_if_change_cflags( + os.path.abspath(so_name), self.extensions[0]) + # Consider .cu, .cu.cc as valid source extensions. self.compiler.src_extensions += ['.cu', '.cu.cc'] # Save the original _compile method for later. diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index db2da5574854c27267dd568d4eed1432acd5353f..df953763a66ed34e78952e4a22f55a8d93c5c2fc 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -16,7 +16,9 @@ import os import re import six import sys +import json import glob +import hashlib import logging import collections import textwrap @@ -219,6 +221,106 @@ class CustomOpInfo: return next(reversed(self.op_info_map.items())) +VersionFields = collections.namedtuple('VersionFields', [ + 'sources', + 'extra_compile_args', + 'extra_link_args', + 'library_dirs', + 'runtime_library_dirs', + 'include_dirs', + 'define_macros', + 'undef_macros', +]) + + +class VersionManager: + def __init__(self, version_field): + self.version_field = version_field + self.version = self.hasher(version_field) + + def hasher(self, version_field): + from paddle.fluid.layers.utils import flatten + + md5 = hashlib.md5() + for field in version_field._fields: + elem = getattr(version_field, field) + if not elem: continue + if isinstance(elem, (list, tuple, dict)): + flat_elem = flatten(elem) + md5 = combine_hash(md5, tuple(flat_elem)) + else: + raise RuntimeError( + "Support types with list, tuple and dict, but received {} with
{}.". + format(type(elem), elem)) + + return md5.hexdigest() + + @property + def details(self): + return self.version_field._asdict() + + +def combine_hash(md5, value): + """ + Return new hash value. + DO NOT use `hash()` because it doesn't generate stable value between different processes. + See https://stackoverflow.com/questions/27522626/hash-function-in-python-3-3-returns-different-results-between-sessions + """ + md5.update(repr(value).encode()) + return md5 + + +def clean_object_if_change_cflags(so_path, extension): + """ + If already compiling source before, we should check whether cflags + have changed and delete the built object to re-compile the source + even though source file content keeps unchanged. + """ + + def serialize(path, version_info): + assert isinstance(version_info, dict) + with open(path, 'w') as f: + f.write(json.dumps(version_info, indent=4, sort_keys=True)) + + def deserialize(path): + assert os.path.exists(path) + with open(path, 'r') as f: + content = f.read() + return json.loads(content) + + # version file + VERSION_FILE = "version.txt" + base_dir = os.path.dirname(so_path) + so_name = os.path.basename(so_path) + version_file = os.path.join(base_dir, VERSION_FILE) + + # version info + args = [getattr(extension, field, None) for field in VersionFields._fields] + version_field = VersionFields._make(args) + versioner = VersionManager(version_field) + + if os.path.exists(so_path) and os.path.exists(version_file): + old_version_info = deserialize(version_file) + so_version = old_version_info.get(so_name, None) + # delete shared library file if version is changed to re-compile it. + if so_version is not None and so_version != versioner.version: + log_v( + "Re-Compiling {}, because specified cflags have been changed. New signature {} has been saved into {}.".
+ format(so_name, versioner.version, version_file)) + os.remove(so_path) + # update new version information + new_version_info = versioner.details + new_version_info[so_name] = versioner.version + serialize(version_file, new_version_info) + else: + # If compiling for the first time, save compiling detail information for debug. + if not os.path.exists(base_dir): + os.makedirs(base_dir) + details = versioner.details + details[so_name] = versioner.version + serialize(version_file, details) + + def prepare_unix_cudaflags(cflags): """ Prepare all necessary compiled flags for nvcc compiling CUDA files.