Delete Ref & VectorRef and add GetDataSafely (#22997)

* delete invalid check interface Ref & VectorRef, test=develop * fix vector ref delete error, test=develop * try the new check interface, test=develop * change all related code with new check macro, test=develop * remove static assert, test=develop * polish detail, test=develop * skip coverage problem, test=develop * add new check macro, test=develop

Delete Ref & VectorRef and add GetDataSafely (#22997)
* delete invalid check interface Ref & VectorRef, test=develop * fix vector ref delete error, test=develop * try the new check interface, test=develop * change all related code with new check macro, test=develop * remove static assert, test=develop * polish detail, test=develop * skip coverage problem, test=develop * add new check macro, test=develop
16315d3d · Chen Weihang · GitHub · 4c675a45 · 16315d3d · 16315d3d
37 changed files
--- a/paddle/fluid/operators/activation_cudnn.cu.cc
+++ b/paddle/fluid/operators/activation_cudnn.cu.cc
@@ -31,8 +31,8 @@ class CudnnActivationKernel
    ExtractActivationTensor(context, X, Out);
    ActivationDescriptor act_desc;
    TensorDescriptor x_desc, out_desc;
-    x_desc.set(detail::Ref(X));
-    out_desc.set(detail::Ref(Out));
+    x_desc.set(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivation"));
+    out_desc.set(GET_DATA_SAFELY(Out, "Output", "Out", "CudnnActivation"));
  }
 };


--- a/paddle/fluid/operators/activation_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc
@@ -37,7 +37,7 @@ struct CudnnActivationFunctor {
    act_desc.set(mode_, coef_);
    TensorDescriptor x_desc, out_desc;
    x_desc.set(x);
-    out_desc.set(detail::Ref(out));
+    out_desc.set(GET_DATA_SAFELY(out, "Output", "Out", "CudnnActivation"));
    PADDLE_ENFORCE(platform::dynload::cudnnActivationForward(
        ctx_.cudnn_handle(), act_desc.desc(),
        platform::CudnnDataType<T>::kOne(), x_desc.desc(), x.data<T>(),
@@ -63,7 +63,7 @@ struct CudnnActivationGradFunctor {
    x_desc.set(x);
    out_desc.set(out);
    dout_desc.set(dout);
-    dx_desc.set(detail::Ref(dx));
+    dx_desc.set(GET_DATA_SAFELY(dx, "Output", "X@GRAD", "CudnnActivationGrad"));
    PADDLE_ENFORCE(platform::dynload::cudnnActivationBackward(
        ctx_.cudnn_handle(), act_desc.desc(),
        platform::CudnnDataType<T>::kOne(), out_desc.desc(), out.data<T>(),
@@ -141,7 +141,7 @@ class CudnnActivationKernel
    Out->mutable_data<T>(context.GetPlace());
    auto& dev_ctx = context.template device_context<CUDADeviceContext>();
    Functor functor(dev_ctx);
-    functor(detail::Ref(X), Out);
+    functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivation"), Out);
  }
 };

@@ -161,7 +161,10 @@ class CudnnActivationGradKernel
    dX->mutable_data<T>(context.GetPlace());
    auto& dev_ctx = context.template device_context<CUDADeviceContext>();
    Functor functor(dev_ctx);
-    functor(detail::Ref(X), detail::Ref(Out), detail::Ref(dOut), dX);
+    functor(GET_DATA_SAFELY(X, "Input", "X", "CudnnActivationGrad"),
+            GET_DATA_SAFELY(Out, "Input", "Out", "CudnnActivationGrad"),
+            GET_DATA_SAFELY(dOut, "Input", "Out@GRAD", "CudnnActivationGrad"),
+            dX);
  }
 };


--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -26,7 +26,6 @@ limitations under the License. */

 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/float16.h"

@@ -156,8 +155,10 @@ class ActivationKernel
    ExtractActivationTensor(context, &X, &Out);
    Out->mutable_data<T>(context.GetPlace());

-    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
+    auto x = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "Activation"));
+    auto out = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Output", "Out", "Activation"));
    auto* place =
        context.template device_context<DeviceContext>().eigen_device();
    Functor functor;
@@ -182,10 +183,14 @@ class ActivationGradKernel
    ExtractActivationGradTensor<Functor::FwdDeps()>(context, &X, &Out, &dOut,
                                                    &dX);
    dX->mutable_data<T>(context.GetPlace());
-    auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
-    auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
-    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
+    auto dout = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(dOut, "Input", "Out@GRAD", "ActivationGrad"));
+    auto out = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "ActivationGrad"));
+    auto dx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(dX, "Output", "X@GRAD", "ActivationGrad"));
+    auto x = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "ActivationGrad"));
    auto* place =
        context.template device_context<DeviceContext>().eigen_device();
    Functor functor;
@@ -1285,10 +1290,13 @@ struct ReluGradGradFunctor : public BaseActivationFunctor<T> {
                  framework::Tensor* ddOut, framework::Tensor* dOut,
                  framework::Tensor* dX) const {
    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(detail::Ref(ddX));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
+    auto ddx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad"));
+    auto out = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad"));
    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
+      auto ddout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad"));
      ddout.device(*d) = ddx * (out > static_cast<T>(0)).template cast<T>();
    }
  }
@@ -1308,9 +1316,12 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
                  framework::Tensor* dX) const {
    if (ddOut) {
      auto* d = dev.eigen_device();
-      auto ddx = framework::EigenVector<T>::Flatten(detail::Ref(ddX));
-      auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
-      auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
+      auto ddx = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
+      auto out = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(Out, "Output", "Out", "LeakyReluGradGrad"));
+      auto ddout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LeakyReluGradGrad"));
      ddout.device(*d) = ddx *
                         ((out > static_cast<T>(0)).template cast<T>() +
                          static_cast<T>(alpha) *
@@ -1332,18 +1343,23 @@ struct ELUGradGradFunctor : public BaseActivationFunctor<T> {
                  const framework::Tensor* ddX, framework::Tensor* ddOut,
                  const framework::Tensor* dOut, framework::Tensor* dX) const {
    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(detail::Ref(ddX));
-    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
+    auto ddx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "ELUGradGrad"));
+    auto x = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "ELUGradGrad"));

    if (dX) {
-      auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
-      auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
+      auto dx = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dX, "Output", "DX", "ELUGradGrad"));
+      auto dout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Input", "DOut", "ELUGradGrad"));
      dx.device(*d) = ddx * dout * static_cast<T>(alpha) * x.exp() *
                      (x < static_cast<T>(0)).template cast<T>();
    }

    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
+      auto ddout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ELUGradGrad"));
      ddout.device(*d) = ddx *
                         ((x > static_cast<T>(0)).template cast<T>() +
                          static_cast<T>(alpha) * x.exp() *
@@ -1361,17 +1377,22 @@ struct SqrtGradGradFunctor : public BaseActivationFunctor<T> {
                  const framework::Tensor* ddX, framework::Tensor* ddOut,
                  framework::Tensor* dOut, const framework::Tensor* dX) const {
    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(detail::Ref(ddX));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
+    auto ddx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "SqrtGradGrad"));
+    auto out = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Output", "Out", "SqrtGradGrad"));
    // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx
    // calculate dy first, so ddy can inplace ddx
    if (dOut) {
-      auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
-      auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
+      auto dx = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dX, "Input", "DX", "SqrtGradGrad"));
+      auto dout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Output", "DOut", "SqrtGradGrad"));
      dout.device(*d) = dx * ddx * static_cast<T>(-1) / out;
    }
    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
+      auto ddout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SqrtGradGrad"));
      ddout.device(*d) = ddx * static_cast<T>(0.5) / out;
    }
  }
@@ -1385,17 +1406,22 @@ struct SquareGradGradFunctor : public BaseActivationFunctor<T> {
                  const framework::Tensor* ddX, framework::Tensor* ddOut,
                  const framework::Tensor* dOut, framework::Tensor* dX) const {
    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(detail::Ref(ddX));
-    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
+    auto ddx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "SquareGradGrad"));
+    auto x = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "SquareGradGrad"));
    // square GradGrad: ddy=2x*ddx, dx=2dy*ddx
    // calculate dx first, so ddy can inplace ddx
    if (dX) {
-      auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
-      auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
+      auto dx = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dX, "Output", "DX", "SquareGradGrad"));
+      auto dout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Input", "DOut", "SquareGradGrad"));
      dx.device(*d) = ddx * static_cast<T>(2) * dout;
    }
    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(detail::Ref(ddOut));
+      auto ddout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad"));
      ddout.device(*d) = ddx * static_cast<T>(2) * x;
    }
  }
@@ -1557,8 +1583,10 @@ class PowKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
    ExtractActivationTensor(context, &X, &Out);
    Out->mutable_data<T>(context.GetPlace());

-    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
+    auto x = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "Pow"));
+    auto out = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Output", "Out", "Pow"));
    auto* place =
        context.template device_context<DeviceContext>().eigen_device();
    Functor functor;
@@ -1602,10 +1630,14 @@ class PowGradKernel
    ExtractActivationGradTensor<Functor::FwdDeps()>(context, &X, &Out, &dOut,
                                                    &dX);
    dX->mutable_data<T>(context.GetPlace());
-    auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
-    auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
-    auto dx = framework::EigenVector<T>::Flatten(detail::Ref(dX));
-    auto x = framework::EigenVector<T>::Flatten(detail::Ref(X));
+    auto dout = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(dOut, "Input", "Out@GRAD", "PowGrad"));
+    auto out = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "PowGrad"));
+    auto dx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(dX, "Output", "X@GRAD", "PowGrad"));
+    auto x = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "PowGrad"));
    auto* place =
        context.template device_context<DeviceContext>().eigen_device();
    Functor functor;

--- a/paddle/fluid/operators/add_position_encoding_op.h
+++ b/paddle/fluid/operators/add_position_encoding_op.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/controlflow/get_places_op.cc
+++ b/paddle/fluid/operators/controlflow/get_places_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */

 #include <thread>  // NOLINT
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/place.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/gpu_info.h"
@@ -56,10 +55,9 @@ class GetPlacesOp : public framework::OperatorBase {
                      is_gpu ? "GPU" : "CPU");

    auto out_var_name = Output("Out");
-    auto &places =
-        *(detail::Ref(scope.FindVar(out_var_name),
-                      "Output variable %s cannot be found", out_var_name)
-              .GetMutable<platform::PlaceList>());
+    auto &places = *(GET_DATA_SAFELY(scope.FindVar(out_var_name), "Output",
+                                     "Out", "GetPlaces")
+                         .GetMutable<platform::PlaceList>());
    places.reserve(device_count);
    if (is_gpu) {
      PADDLE_ENFORCE_LE(device_count, CUDADevCount(),

--- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/array_operator.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/math_function.h"

 namespace paddle {

--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -19,7 +19,6 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"

 namespace paddle {
 namespace operators {
@@ -198,23 +197,18 @@ class WhileGradOp : public framework::OperatorBase {
          continue;
        }

-        auto &og_outside =
-            detail::Ref(scope.FindVar(outside_og_name),
-                        "Cannot find Outside Gradient %s", outside_og_name);
-        auto &og_inside =
-            detail::Ref(cur_scope.Var(inside_og_name),
-                        "Cannot find inside gradient %s", inside_og_name);
+        auto &og_outside = *scope.FindVar(outside_og_name);
+        auto &og_inside = *cur_scope.Var(inside_og_name);
        if (og_outside.IsType<framework::LoDTensor>()) {
          auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
-          auto &inside_tensor =
-              detail::Ref(og_inside.GetMutable<framework::LoDTensor>());
+          auto &inside_tensor = *og_inside.GetMutable<framework::LoDTensor>();
          inside_tensor.set_lod(outside_tensor.lod());
          inside_tensor.ShareDataWith(outside_tensor);
        } else if (og_outside.IsType<framework::LoDTensorArray>()) {
          auto outside_array =
              og_outside.GetMutable<framework::LoDTensorArray>();
          auto &inside_array =
-              detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
+              *og_inside.GetMutable<framework::LoDTensorArray>();
          inside_array.clear();
          inside_array.resize(outside_array->size());
          VLOG(8) << outside_og_name << " size = " << outside_array->size();

--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -20,7 +20,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
@@ -674,9 +673,8 @@ class GemmConvDoubleGradKernel : public framework::OpKernel<T> {
    Tensor* ddY = ctx.Output<Tensor>("DDOutput");
    Tensor* dW = ctx.Output<Tensor>("DFilter");
    Tensor* dX = ctx.Output<Tensor>("DInput");
-    Tensor W = detail::Ref(ctx.Input<Tensor>("Filter"),
-                           "Cannot find input Filter(%s) in scope)",
-                           ctx.InputNames("Filter")[0]);
+    Tensor W = GET_DATA_SAFELY(ctx.Input<Tensor>("Filter"), "Input", "Filter",
+                               "GemmConvDoubleGrad");
    if (!ddY && !dW && !dX) return;

    const int groups = ctx.Attr<int>("groups");

--- a/paddle/fluid/operators/cum_op.h
+++ b/paddle/fluid/operators/cum_op.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"

 namespace paddle {
 namespace operators {
@@ -29,13 +28,11 @@ class CumKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
  using T = typename Functor::ELEMENT_TYPE;

  void Compute(const framework::ExecutionContext& context) const override {
-    auto& X = detail::Ref(context.Input<framework::Tensor>("X"),
-                          "Cannot get input tensor X, variable name = %s",
-                          context.InputName("X"));
+    auto& X = GET_DATA_SAFELY(context.Input<framework::Tensor>("X"), "Input",
+                              "X", "Cum");

-    auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"),
-                            "Cannot get output tensor Out, variable name = %s",
-                            context.OutputName("Out"));
+    auto& Out = GET_DATA_SAFELY(context.Output<framework::Tensor>("Out"),
+                                "Output", "Out", "Cum");
    int axis = context.Attr<int>("axis");
    bool exclusive = context.Attr<bool>("exclusive");
    bool reverse = context.Attr<bool>("reverse");
@@ -46,7 +43,7 @@ class CumKernel : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
    PADDLE_ENFORCE_LT(
        axis, x_dims.size(),
        "axis should be less than the dimensiotn of the input tensor");
-    Out.mutable_data<T>(context.GetPlace());
+    Out.template mutable_data<T>(context.GetPlace());

    int pre = 1;
    int post = 1;

--- a/paddle/fluid/operators/detail/safe_ref.h
+++ b/paddle/fluid/operators/detail/safe_ref.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-/**
- * Get Reference From Pointer with check. The error message is printf format,
- * and passed by `args`
- */
-template <typename T, typename... ARGS>
-inline T& Ref(T* ptr, ARGS&&... args) {
-  PADDLE_ENFORCE_NOT_NULL(ptr, ::paddle::string::Sprintf(args...));
-  return *ptr;
-}
-
-template <typename T, typename... ARGS>
-inline std::vector<std::reference_wrapper<T>> VectorRef(
-    const std::vector<T*>& vec, ARGS&&... args) {
-  std::vector<std::reference_wrapper<T>> result;
-  result.reserve(vec.size());
-  for (auto* ptr : vec) {
-    result.emplace_back(Ref(ptr, args...));
-  }
-  return result;
-}
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
@@ -20,7 +20,6 @@ limitations under the License.*/
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/math_function.h"


--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
@@ -20,7 +20,6 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/math_function.h"


--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/math_function.h"

@@ -293,12 +292,10 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
    auto *scores = context.Input<Tensor>("Scores");
    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
    auto *im_info = context.Input<Tensor>("ImInfo");
-    auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
-                               "Cannot find input Anchors(%s) in scope",
-                               context.InputNames("Anchors")[0]);
-    auto variances = detail::Ref(context.Input<Tensor>("Variances"),
-                                 "Cannot find input Variances(%s) in scope",
-                                 context.InputNames("Variances")[0]);
+    auto anchors = GET_DATA_SAFELY(context.Input<Tensor>("Anchors"), "Input",
+                                   "Anchors", "GenerateProposals");
+    auto variances = GET_DATA_SAFELY(context.Input<Tensor>("Variances"),
+                                     "Input", "Variances", "GenerateProposals");

    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");

--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -20,7 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/gather.cu.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
@@ -367,12 +366,10 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
    auto *scores = context.Input<Tensor>("Scores");
    auto *bbox_deltas = context.Input<Tensor>("BboxDeltas");
    auto *im_info = context.Input<Tensor>("ImInfo");
-    auto anchors = detail::Ref(context.Input<Tensor>("Anchors"),
-                               "Cannot find input Anchors(%s) in scope",
-                               context.InputNames("Anchors")[0]);
-    auto variances = detail::Ref(context.Input<Tensor>("Variances"),
-                                 "Cannot find input Variances(%s) in scope",
-                                 context.InputNames("Variances")[0]);
+    auto anchors = GET_DATA_SAFELY(context.Input<Tensor>("Anchors"), "Input",
+                                   "Anchors", "GenerateProposals");
+    auto variances = GET_DATA_SAFELY(context.Input<Tensor>("Variances"),
+                                     "Input", "Variances", "GenerateProposals");

    auto *rpn_rois = context.Output<LoDTensor>("RpnRois");
    auto *rpn_roi_probs = context.Output<LoDTensor>("RpnRoiProbs");

--- a/paddle/fluid/operators/fill_op.h
+++ b/paddle/fluid/operators/fill_op.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <algorithm>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"

 namespace paddle {
 namespace operators {
@@ -44,10 +43,8 @@ template <typename T>
 class FillKernel : public framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext &ctx) const override {
-    auto &out =
-        detail::Ref(ctx.Output<framework::LoDTensor>("Out"),
-                    "Cannot get output lod tensor Out, variable name = %s",
-                    ctx.OutputName("Out"));
+    auto &out = GET_DATA_SAFELY(ctx.Output<framework::LoDTensor>("Out"),
+                                "Output", "Out", "Fill");
    out.Resize(framework::make_ddim(ctx.Attr<std::vector<int>>("shape")));
    auto dtype =
        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));

--- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h
+++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/compound_functors.h"
 #include "paddle/fluid/operators/math/functors.h"
@@ -383,12 +382,10 @@ template <typename DeviceContext, typename T>
 class FusedElemwiseActivationKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto &in_x = detail::Ref(ctx.Input<framework::Tensor>("X"),
-                             "Cannot get input tensor %s, variable name = %s",
-                             "X", ctx.InputName("X"));
-    auto &in_y = detail::Ref(ctx.Input<framework::Tensor>("Y"),
-                             "Cannot get input tensor %s, variable name = %s",
-                             "Y", ctx.InputName("Y"));
+    auto &in_x = GET_DATA_SAFELY(ctx.Input<framework::Tensor>("X"), "Input",
+                                 "X", "FusedElemwiseActivation");
+    auto &in_y = GET_DATA_SAFELY(ctx.Input<framework::Tensor>("Y"), "Input",
+                                 "Y", "FusedElemwiseActivation");
    PADDLE_ENFORCE(ctx.HasOutput("Out"), "The output(Out) should not be empty");
    auto output = ctx.Output<framework::Tensor>("Out");


--- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc
+++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */

 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/errors.h"

 namespace paddle {

--- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
@@ -19,7 +19,6 @@
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
 #include "paddle/fluid/operators/math/blas.h"


--- a/paddle/fluid/operators/fused/multihead_matmul_op.cc
+++ b/paddle/fluid/operators/fused/multihead_matmul_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */

 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/errors.h"

 namespace paddle {

--- a/paddle/fluid/operators/fused/multihead_matmul_op.cu
+++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu
@@ -17,7 +17,6 @@
 #include <algorithm>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
 #include "paddle/fluid/operators/math/blas.h"

@@ -142,14 +141,13 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
    auto *input = context.Input<framework::Tensor>("Input");
    auto *w = context.Input<framework::Tensor>("W");
    auto *bias = context.Input<framework::Tensor>("Bias");
-
-    auto &bias_qk = detail::Ref(context.Input<framework::Tensor>("BiasQK"),
-                                "Cannot find QK");
+    auto &bias_qk = GET_DATA_SAFELY(context.Input<framework::Tensor>("BiasQK"),
+                                    "Input", "BiasQK", "MultiHeadMatMulV2");

    auto *input_d = input->data<T>();
    auto *w_d = w->data<T>();
    auto *bias_d = bias->data<T>();
-    auto *bias_qk_d = bias_qk.data<T>();
+    auto *bias_qk_d = bias_qk.template data<T>();
    T scale = static_cast<T>(context.Attr<float>("alpha"));

    int head_number = context.Attr<int>("head_number");

--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/clip_op.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/matrix_bit_code.h"
 #include "paddle/fluid/platform/transform.h"
@@ -40,8 +39,9 @@ template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 using platform::Transform;
+using framework::LoDTensor;

-static std::vector<int64_t> PathToRows(const framework::LoDTensor& path) {
+static std::vector<int64_t> PathToRows(const LoDTensor& path) {
  std::set<int64_t> rows;
  const int64_t* paths = path.data<int64_t>();
  for (int64_t i = 0; i < path.numel(); ++i) {
@@ -57,14 +57,17 @@ template <typename DeviceContext, typename T>
 class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
-    auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
-    auto* path = ctx.Input<framework::LoDTensor>("PathTable");
-    auto* code = ctx.Input<framework::LoDTensor>("PathCode");
-    auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
-    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
-    auto* pre_out = ctx.Output<framework::LoDTensor>("PreOut");
+    auto& in = GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X",
+                               "HierarchicalSigmoid");
+    auto& w = GET_DATA_SAFELY(ctx.Input<LoDTensor>("W"), "Input", "W",
+                              "HierarchicalSigmoid");
+    auto* path = ctx.Input<LoDTensor>("PathTable");
+    auto* code = ctx.Input<LoDTensor>("PathCode");
+    auto& label = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Label"), "Input",
+                                  "Label", "HierarchicalSigmoid");
+    auto* bias = ctx.Input<LoDTensor>("Bias");
+    auto* out = ctx.Output<LoDTensor>("Out");
+    auto* pre_out = ctx.Output<LoDTensor>("PreOut");
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
    // for remote prefetch

@@ -75,7 +78,7 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
    int64_t code_length =
        path ? path->dims()[1] : math::FindLastSet(num_classes - 1);
    int64_t batch_size = in.dims()[0];
-    framework::LoDTensor sum;
+    LoDTensor sum;
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto* pre_out_data = pre_out->mutable_data<T>(
        framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
@@ -89,11 +92,11 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
-                                                       label.data<int64_t>()));
+      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
+          num_classes, label.template data<int64_t>()));
    } else {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(*path, *code,
-                                                       label.data<int64_t>()));
+      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
+          *path, *code, label.template data<int64_t>()));
    }

    std::vector<int64_t> sum_dims({batch_size, 1UL});
@@ -126,20 +129,24 @@ template <typename DeviceContext, typename T>
 class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto& in = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
-    auto& w = detail::Ref(ctx.Input<framework::LoDTensor>("W"));
-    auto* path = ctx.Input<framework::LoDTensor>("PathTable");
-    auto* code = ctx.Input<framework::LoDTensor>("PathCode");
-    auto* in_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto& in = GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X",
+                               "HierarchicalSigmoidGrad");
+    auto& w = GET_DATA_SAFELY(ctx.Input<LoDTensor>("W"), "Input", "W",
+                              "HierarchicalSigmoidGrad");
+    auto* path = ctx.Input<LoDTensor>("PathTable");
+    auto* code = ctx.Input<LoDTensor>("PathCode");
+    auto* in_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
    bool is_sparse = ctx.Attr<bool>("is_sparse");
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::SetConstant<DeviceContext, T> zero;
-    auto& label = detail::Ref(ctx.Input<framework::LoDTensor>("Label"));
-    auto& pre_out = detail::Ref(ctx.Input<framework::LoDTensor>("PreOut"));
-    auto& out_grad = detail::Ref(
-        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out")));
-    framework::LoDTensor pre_out_grad;
+    auto& label = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Label"), "Input",
+                                  "Label", "HierarchicalSigmoidGrad");
+    auto& pre_out = GET_DATA_SAFELY(ctx.Input<LoDTensor>("PreOut"), "Input",
+                                    "PreOut", "HierarchicalSigmoidGrad");
+    auto& out_grad = GET_DATA_SAFELY(
+        ctx.Input<LoDTensor>(framework::GradVarName("Out")), "Input",
+        framework::GradVarName("Out"), "HierarchicalSigmoidGrad");
+    LoDTensor pre_out_grad;

    pre_out_grad.mutable_data<T>(pre_out.dims(), ctx.GetPlace());
    in_grad->mutable_data<T>(ctx.GetPlace());
@@ -154,11 +161,11 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {

    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
    if (!is_custom) {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
-                                                       label.data<int64_t>()));
+      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
+          num_classes, label.template data<int64_t>()));
    } else {
-      bit_code.reset(new math::MatrixBitCodeFunctor<T>(*path, *code,
-                                                       label.data<int64_t>()));
+      bit_code.reset(new math::MatrixBitCodeFunctor<T>(
+          *path, *code, label.template data<int64_t>()));
    }

    // softrelu derivative
@@ -166,7 +173,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
    auto blas = math::GetBlas<DeviceContext, T>(ctx);

    auto* pre_out_grad_data = pre_out_grad.data<T>();
-    auto* pre_out_data = pre_out.data<T>();
+    auto* pre_out_data = pre_out.template data<T>();
    auto n = pre_out.numel();
    blas.VEXP(n, pre_out_data, pre_out_grad_data);
    blas.VINV(n, pre_out_grad_data, pre_out_grad_data);
@@ -174,7 +181,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
      pre_out_grad_data[i] = 1.0 - pre_out_grad_data[i];
    }
    bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
-    auto* out_grad_data = out_grad.data<T>();
+    auto* out_grad_data = out_grad.template data<T>();

    int64_t dim0 = pre_out_grad.dims()[0];
    int64_t dim1 = pre_out_grad.dims()[1];
@@ -184,16 +191,14 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
    }
    // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
    // be consistent with the clipping in forward.
-    auto* bias_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
+    auto* bias_grad = ctx.Output<LoDTensor>(framework::GradVarName("Bias"));
    if (bias_grad) {
      bias_grad->mutable_data<T>(ctx.GetPlace());
      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
      bit_code->AddGrad(pre_out_grad, bias_grad);
    }
    if (!is_sparse) {
-      auto* w_grad =
-          ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
+      auto* w_grad = ctx.Output<LoDTensor>(framework::GradVarName("W"));
      w_grad->mutable_data<T>(ctx.GetPlace());
      zero(dev_ctx, w_grad, static_cast<T>(0.0));
      bit_code->MulGradWeight(pre_out_grad, w_grad, in);

--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/port.h"
@@ -95,13 +94,15 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
-    auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
-                          Input("X"))
+    auto &x = GET_DATA_SAFELY(scope.FindVar(Input("X")), "Input", "X",
+                              "LoDTensorToArray")
                  .Get<framework::LoDTensor>();
-    auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")))
+    auto &rank_table = GET_DATA_SAFELY(scope.FindVar(Input("RankTable")),
+                                       "Input", "RankTable", "LoDTensorToArray")
                           .Get<framework::LoDRankTable>();
-    auto &out = *detail::Ref(scope.FindVar(Output("Out")))
-                     .GetMutable<framework::LoDTensorArray>();
+    auto &out = *(GET_DATA_SAFELY(scope.FindVar(Output("Out")), "Output", "Out",
+                                  "LoDTensorToArray")
+                      .GetMutable<framework::LoDTensorArray>());
    auto &items = rank_table.items();
    auto max_seq_len = items[0].length;
    auto rank_level = rank_table.level();

--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/blas.h"

 namespace paddle {
@@ -58,10 +57,10 @@ template <typename DeviceContext, typename T>
 class MatMulKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
-    auto &x =
-        detail::Ref(context.Input<framework::Tensor>("X"), "Cannot find X");
-    auto &y =
-        detail::Ref(context.Input<framework::Tensor>("Y"), "Cannot find Y");
+    auto &x = GET_DATA_SAFELY(context.Input<framework::Tensor>("X"), "Input",
+                              "X", "MatMul");
+    auto &y = GET_DATA_SAFELY(context.Input<framework::Tensor>("Y"), "Input",
+                              "Y", "MatMul");
    auto *out = context.Output<framework::Tensor>("Out");
    out->mutable_data<T>(context.GetPlace());


--- a/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h
+++ b/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h
@@ -17,7 +17,6 @@ limitations under the License. */

 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"

 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"

--- a/paddle/fluid/operators/optimizers/adam_op.cu
+++ b/paddle/fluid/operators/optimizers/adam_op.cu
@@ -128,7 +128,6 @@ class AdamOpCUDAKernel : public framework::OpKernel<T> {
                          framework::ToTypeName(param_var->Type())));

    using paddle::framework::LoDTensor;
-    using paddle::operators::detail::Ref;

    int64_t min_row_size_to_use_multithread =
        ctx.Attr<int64_t>("min_row_size_to_use_multithread");

--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -20,7 +20,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/algorithm.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/for_range.h"
@@ -384,7 +383,6 @@ class AdamOpKernel : public framework::OpKernel<T> {
                   framework::ToTypeName(param_var->Type()));

    using paddle::framework::LoDTensor;
-    using paddle::operators::detail::Ref;

    int64_t min_row_size_to_use_multithread =
        ctx.Attr<int64_t>("min_row_size_to_use_multithread");

--- a/paddle/fluid/operators/optimizers/lamb_op.h
+++ b/paddle/fluid/operators/optimizers/lamb_op.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <Eigen/Dense>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/algorithm.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/for_range.h"
@@ -185,30 +184,32 @@ class LambOpKernel : public framework::OpKernel<T> {
                   framework::ToTypeName(param_var->Type()));

    using paddle::framework::LoDTensor;
-    using paddle::operators::detail::Ref;

    T weight_decay = static_cast<T>(ctx.Attr<float>("weight_decay"));
    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
-    auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param.");
+    auto& param = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Param"), "Input",
+                                  "Param", "Lamb");
    auto* grad_var = ctx.InputVar("Grad");
-    auto& mom1 = Ref(ctx.Input<LoDTensor>("Moment1"), "Must set Moment1.");
-    auto& mom2 = Ref(ctx.Input<LoDTensor>("Moment2"), "Must set Moment2.");
-    auto& lr =
-        Ref(ctx.Input<LoDTensor>("LearningRate"), "Must set LearningRate.");
-
-    auto& beta1_pow =
-        Ref(ctx.Input<LoDTensor>("Beta1Pow"), "Must set Beta1Pow.");
-    auto& beta2_pow =
-        Ref(ctx.Input<LoDTensor>("Beta2Pow"), "Must set Beta2Pow.");
-
-    auto& param_out =
-        Ref(ctx.Output<LoDTensor>("ParamOut"), "Must set ParamOut.");
-    auto& mom1_out =
-        Ref(ctx.Output<LoDTensor>("Moment1Out"), "Must set Moment1Out.");
-    auto& mom2_out =
-        Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out.");
+    auto& mom1 = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Moment1"), "Input",
+                                 "Moment1", "Lamb");
+    auto& mom2 = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Moment2"), "Input",
+                                 "Moment2", "Lamb");
+    auto& lr = GET_DATA_SAFELY(ctx.Input<LoDTensor>("LearningRate"), "Input",
+                               "LearningRate", "Lamb");
+
+    auto& beta1_pow = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Beta1Pow"), "Input",
+                                      "Beta1Pow", "Lamb");
+    auto& beta2_pow = GET_DATA_SAFELY(ctx.Input<LoDTensor>("Beta2Pow"), "Input",
+                                      "Beta2Pow", "Lamb");
+
+    auto& param_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("ParamOut"),
+                                      "Output", "ParamOut", "Lamb");
+    auto& mom1_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("Moment1Out"),
+                                     "Output", "Moment1Out", "Lamb");
+    auto& mom2_out = GET_DATA_SAFELY(ctx.Output<LoDTensor>("Moment2Out"),
+                                     "Output", "Moment2Out", "Lamb");

    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    platform::ForRange<DeviceContext> for_range(dev_ctx, param.numel());
@@ -217,7 +218,7 @@ class LambOpKernel : public framework::OpKernel<T> {

    // Update moments
    if (grad_var->IsType<framework::LoDTensor>()) {
-      auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad.");
+      auto& grad = *ctx.Input<LoDTensor>("Grad");

      LambMomentUpdateFunctor<T> moment_update_functor(
          weight_decay, beta1, beta2, epsilon, beta1_pow.template data<T>(),
@@ -229,8 +230,8 @@ class LambOpKernel : public framework::OpKernel<T> {
          trust_ratio_div.template data<T>());
      for_range(moment_update_functor);
    } else if (grad_var->IsType<framework::SelectedRows>()) {
-      auto& grad =
-          Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad.");
+      auto& grad = GET_DATA_SAFELY(ctx.Input<framework::SelectedRows>("Grad"),
+                                   "Input", "Grad", "Lamb");
      if (grad.rows().size() == 0) {
        VLOG(3) << "grad row size is 0!!";
        return;

--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -16,7 +16,6 @@

 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
 #ifdef PADDLE_WITH_CUDA
@@ -152,10 +151,11 @@ class RandomCropKernel : public framework::OpKernel<T> {
 public:
  virtual void Compute(const framework::ExecutionContext& ctx) const {
    int64_t seed = 0;
-    auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
+    auto& seed_tensor = GET_DATA_SAFELY(ctx.Input<framework::LoDTensor>("Seed"),
+                                        "Input", "Seed", "RandomCrop");
    if (seed_tensor.IsInitialized()) {
      if (platform::is_cpu_place(seed_tensor.place())) {
-        seed = *seed_tensor.data<int64_t>();
+        seed = *seed_tensor.template data<int64_t>();
      } else {
        LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
                        "your program";
@@ -169,13 +169,15 @@ class RandomCropKernel : public framework::OpKernel<T> {
      seed = ctx.Attr<int>("startup_seed");
    }
    auto shape = ctx.Attr<std::vector<int>>("shape");
-    auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
-    auto& out = detail::Ref(ctx.Output<framework::LoDTensor>("Out"));
+    auto& x = GET_DATA_SAFELY(ctx.Input<framework::LoDTensor>("X"), "Input",
+                              "X", "RandomCrop");
+    auto& out = GET_DATA_SAFELY(ctx.Output<framework::LoDTensor>("Out"),
+                                "Output", "Out", "RandomCrop");

    int num_batchsize_dims = x.dims().size() - shape.size();
    RandomCropFunctor<DeviceContext, T> functor(
-        x.data<T>(), out.mutable_data<T>(ctx.GetPlace()), x.dims(), out.dims(),
-        num_batchsize_dims, seed);
+        x.template data<T>(), out.template mutable_data<T>(ctx.GetPlace()),
+        x.dims(), out.dims(), num_batchsize_dims, seed);
    platform::ForRange<DeviceContext> for_range(
        ctx.template device_context<DeviceContext>(),
        functor.prod_batchsize_dims_);

--- a/paddle/fluid/operators/reader/create_custom_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc
@@ -13,7 +13,6 @@
 // limitations under the License.

 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"

 namespace paddle {
@@ -171,8 +170,11 @@ void CustomReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
  // 3. Copy LoDTensors from sink variables to out.
  out->resize(sink_var_names_.size());
  for (size_t i = 0; i < sink_var_names_.size(); ++i) {
-    const auto& tensor = detail::Ref(exe_scope->FindVar(sink_var_names_[i]))
-                             .Get<framework::LoDTensor>();
+    auto* var = exe_scope->FindVar(sink_var_names_[i]);
+    PADDLE_ENFORCE_NOT_NULL(var, platform::errors::NotFound(
+                                     "The variable %s is not in current scope.",
+                                     sink_var_names_[i]));
+    const auto& tensor = var->Get<framework::LoDTensor>();
    framework::TensorCopySync(tensor, platform::CPUPlace(), &(*out)[i]);
  }
  scope_.DeleteScope(exe_scope);

--- a/paddle/fluid/operators/reader/read_op.cc
+++ b/paddle/fluid/operators/reader/read_op.cc
@@ -15,7 +15,6 @@
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/profiler.h"

 namespace paddle {
@@ -96,8 +95,8 @@ class ReadOp : public framework::OperatorBase {
               const platform::Place& dev_place) const override {
    VLOG(3) << "read op in";
    framework::ReaderHolder* reader =
-        detail::Ref(scope.FindVar(Input("Reader")),
-                    "Cannot find reader variable %s", Input("Reader"))
+        GET_DATA_SAFELY(scope.FindVar(Input("Reader")), "Input", "Reader",
+                        "Read")
            .GetMutable<framework::ReaderHolder>();
    std::vector<std::string> out_arg_names = Outputs("Out");
    std::vector<framework::LoDTensor> ins;

--- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc
@@ -14,7 +14,6 @@ limitations under the License. */

 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/platform/device_context.h"

 namespace paddle {
@@ -78,18 +77,16 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &place) const override {
-    auto &x =
-        detail::Ref(scope.FindVar(Input("X")),
-                    "Cannot find input lod tensor variable %s", Input("X"))
-            .Get<framework::LoDTensor>();
-    auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")),
-                                   "Cannot find input rank table variable %s",
-                                   Input("RankTable"))
-                           .Get<framework::LoDRankTable>();
-    auto &out =
-        *detail::Ref(scope.FindVar(Output("Out")),
-                     "Cannot find output lod tensor variable %s", Output("Out"))
-             .GetMutable<framework::LoDTensor>();
+    auto &x = GET_DATA_SAFELY(scope.FindVar(Input("X")), "Input", "X",
+                              "ReorderLoDTensorByRankTable")
+                  .Get<framework::LoDTensor>();
+    auto &rank_table =
+        GET_DATA_SAFELY(scope.FindVar(Input("RankTable")), "Input", "RankTable",
+                        "ReorderLoDTensorByRankTable")
+            .Get<framework::LoDRankTable>();
+    auto &out = *(GET_DATA_SAFELY(scope.FindVar(Output("Out")), "Output", "Out",
+                                  "ReorderLoDTensorByRankTable")
+                      .GetMutable<framework::LoDTensor>());

    out.Resize(x.dims());
    out.mutable_data(x.place(), x.type());

--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -17,8 +17,6 @@ limitations under the License. */
 #include <memory>
 #include <string>

-#include "paddle/fluid/operators/detail/safe_ref.h"
-
 namespace paddle {
 namespace operators {


--- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h
@@ -18,7 +18,6 @@
 #include <vector>
 #include "boost/optional.hpp"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"

 namespace paddle {
@@ -47,16 +46,28 @@ inline framework::LoD ConcatLoD(const Container &xs,
  lod.emplace_back(result);
  return lod;
 }
+
+template <typename T, typename... ARGS>  // null-checked T* -> T& conversion
+inline std::vector<std::reference_wrapper<T>> GetDataVectorSafely(
+    const std::vector<T *> &vec, ARGS &&... /* accepted but unused */) {
+  std::vector<std::reference_wrapper<T>> result;
+  result.reserve(vec.size());  // exactly one reference per input pointer
+  for (auto *ptr : vec) {
+    PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::InvalidArgument(
+                                     "The input variable X contains nullptr."));
+    result.emplace_back(*ptr);
+  }
+  return result;
+}
 }  // namespace detail

 template <typename DeviceContext, typename T>
 class SeqConcatKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
-    auto xs = detail::VectorRef(context.MultiInput<framework::LoDTensor>("X"),
-                                "Cannot find multiple input X");
-    auto &out = detail::Ref(context.Output<framework::LoDTensor>("Out"),
-                            "Cannot find output");
+    auto xs = detail::GetDataVectorSafely(
+        context.MultiInput<framework::LoDTensor>("X"));
+    auto &out = *context.Output<framework::LoDTensor>("Out");

    size_t lod_size = 0;
    for (auto &x : xs) {
@@ -141,9 +152,9 @@ class SeqConcatGradKernel : public framework::OpKernel<T> {

    math::SplitFunctor<DeviceContext, T> functor;
    functor(context.template device_context<DeviceContext>(),
-            detail::Ref(
+            GET_DATA_SAFELY(
                context.Input<framework::Tensor>(framework::GradVarName("Out")),
-                "Sequence Concat OG must be set"),
+                "Input", framework::GradVarName("Out"), "SeqConcatGrad"),
            sliced_x_ptr, 0, &sliced_dx_ptr);
  }
 };

--- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"

 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <vector>

 #include "paddle/fluid/framework/var_type_inference.h"
-#include "paddle/fluid/operators/detail/safe_ref.h"

 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"

--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -350,6 +350,51 @@ struct EnforceNotMet : public std::exception {

 /** EXTENDED TOOL FUNCTIONS WITH CHECKING **/

+/*
+ * Summary: This macro is used to get the Variable or internal type
+ *   data (such as LoDTensor or SelectedRows) of an Input or
+ *   Output in an op, generally used when calling scope.FindVar(Input/
+ *   Output("Name")) or ctx.Input<LoDTensor>().
+ *   First, this macro checks whether the obtained pointer is null,
+ *   and then returns the data if it is not null.
+ *
+ * Note: This macro is only suitable for specific scenarios and
+ *   is not intended to be widely used. If it cannot meet the
+ *   requirements, please use another PADDLE_ENFORCE** check macro.
+ *
+ * Parameters:
+ *     __PTR: pointer
+ *     __ROLE: (string), Input or Output
+ *     __NAME: (string), Input or Output name
+ *     __OP_TYPE: (string), the op type
+ *  
+ * Return: The data pointed to by the pointer.
+ *
+ * Examples:
+ *    GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
+*/
+#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE)                   \
+  (([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type {          \
+    auto* __ptr = (__PTR); /* must not shadow names in __PTR */             \
+    if (UNLIKELY(nullptr == __ptr)) {                                       \
+      __THROW_ERROR_INTERNAL__(                                             \
+          "%s\n  [Hint: pointer " #__PTR " should not be null.]",           \
+          paddle::platform::errors::NotFound(                               \
+              "Unable to get %s data of %s %s in operator %s. "             \
+              "Possible reasons are:\n"                                     \
+              "  1. The %s is not the %s of operator %s;\n"                 \
+              "  2. The %s has no corresponding variable passed in;\n"      \
+              "  3. The %s corresponding variable is not initialized.",     \
+              paddle::platform::demangle(                                   \
+                  typeid(std::add_lvalue_reference<decltype(*__ptr)>::type) \
+                      .name()),                                             \
+              __ROLE, __NAME, __OP_TYPE, __NAME, __ROLE, __OP_TYPE, __NAME, \
+              __NAME)                                                       \
+              .ToString());                                                 \
+    }                                                                       \
+    return *__ptr;                                                          \
+  })())
+
 /*
 * Summary: This macro is used to check whether op has specified
 * Input or Output Variables. Because op's Input and Output

--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -362,6 +362,22 @@ TEST(enforce, cannot_to_string_type) {
  PADDLE_ENFORCE_NE(list.begin(), list.end());
 }

+TEST(GET_DATA_SAFELY_MACRO, SUCCESS) {
+  int a = 10;  // automatic storage: the original `new int(10)` was never freed
+  EXPECT_EQ(10, GET_DATA_SAFELY(&a, "Input", "X", "dummy"));
+}
+
+TEST(GET_DATA_SAFELY_MACRO, FAIL) {
+  bool caught_exception = false;
+  try {
+    int* a = nullptr;  // null pointer drives the macro into its error branch
+    GET_DATA_SAFELY(a, "Input", "X", "dummy");
+  } catch (paddle::platform::EnforceNotMet& error) {
+    caught_exception = true;
+  }
+  EXPECT_TRUE(caught_exception);  // fails if no EnforceNotMet was thrown
+}
+
 TEST(OP_INOUT_CHECK_MACRO, SUCCESS) {
  OP_INOUT_CHECK(true, "Input", "X", "dummy");
 }