diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index 10b65fa215adc51ecc5c4ff482803a4c8379a757..f15d1fe5e02ac2c168870b525b390355bdcb981b 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -30,8 +30,10 @@ __global__ void ComputeDifferent(T *centers_diff, const T *X, const T *centers, while (idy < K) { int64_t id = ids[idy]; - PADDLE_ENFORCE(id >= 0, "received id:", id); - PADDLE_ENFORCE(id < N, "received id:", id); + PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id); + PADDLE_ENFORCE(id < N, "Id should smaller than %d but received id: %d.", N, + id); + T *out = centers_diff + idy * D; const T *x = X + idy * D; const T *cent = centers + id * D; @@ -52,8 +54,9 @@ __global__ void UpdateCenters(T *centers, T *centers_diff, const int64_t *ids, while (idy < K) { int count = 1; int64_t id = ids[idy]; - PADDLE_ENFORCE(id >= 0, "received id:", id); - PADDLE_ENFORCE(id < N, "received id:", id); + PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id); + PADDLE_ENFORCE(id < N, "Id should smaller than %d but received id: %d.", N, + id); for (int i = 0; i < K; i++) { if (ids[i] == id) { diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index 44a7c16f96a5e6298e4b5602252698198c726c8f..67bd71d4a1be34dc5f5786bd863930329c92a046 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -69,8 +69,10 @@ template class CTCAlignOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::InvalidArgument( + "CTCAlign operator CUDA kernel must use CUDAPlace " + "rather than CPUPlace.")); auto* input = ctx.Input("Input"); auto* output = ctx.Output("Output"); const int blank = ctx.Attr("blank"); diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h index ccf91471ab99ced7d2731f877f14ecb8deb437ea..662f899c0a59337266ebcff1bbc1a6b3ee231605 100644 --- a/paddle/fluid/operators/ctc_align_op.h +++ b/paddle/fluid/operators/ctc_align_op.h @@ -72,8 +72,11 @@ class CTCAlignKernel : public framework::OpKernel { // check input dims and lod PADDLE_ENFORCE_EQ( input_dims[0], static_cast(input_lod[level].back()), - "The first dimension of Input(Input) should be equal to " - "the sum of all sequences' lengths."); + platform::errors::InvalidArgument( + "The first dimension %d of CTCAlign operator Input(Input) should " + "be equal to " + "the sum of all sequences' lengths %d.", + input_dims[0], static_cast(input_lod[level].back()))); const size_t num_sequences = input_lod[level].size() - 1; diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc index 1339982adaab162056bdefd3eecb405e95188a0d..cce066ec40425ee7db34bea92564e84a0daaf101 100644 --- a/paddle/fluid/operators/mv_op.cc +++ b/paddle/fluid/operators/mv_op.cc @@ -42,21 +42,21 @@ class MVOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv"); auto dim_x = context->GetInputDim("X"); - auto dim_y = context->GetInputDim("Vec"); + auto dim_vec = context->GetInputDim("Vec"); PADDLE_ENFORCE_EQ( dim_x.size(), 2, platform::errors::InvalidArgument( "The rank of input X should be 2, but is %d", dim_x.size())); 
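These two rank checks encode the shape contract of the mv operator: X must be a 2-D matrix of shape [m, n] and Vec a 1-D vector of shape [n], giving an output of shape [m]. A minimal usage sketch, mirroring the [5, 100] x [100] shapes used by test_mv_op.py later in this patch (assumes a Paddle build where paddle.mv and dynamic mode are available):

```python
import numpy as np
import paddle

x = paddle.to_tensor(np.random.rand(5, 100))   # X: [m, n] = [5, 100]
vec = paddle.to_tensor(np.random.rand(100))    # Vec: [n] = [100]
out = paddle.mv(x, vec)                        # Out: [m] = [5]
print(out.shape)                               # [5]
```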
PADDLE_ENFORCE_EQ( - dim_y.size(), 1, + dim_vec.size(), 1, platform::errors::InvalidArgument( - "The rank of input Vec should be 1, but is %d", dim_y.size())); - PADDLE_ENFORCE_EQ(dim_x[1] == dim_y[0], true, + "The rank of input Vec should be 1, but is %d", dim_vec.size())); + PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0], platform::errors::InvalidArgument( - "The length of input X' second dim should equal the " - "length of input Vec," - " but X[%d, %d], Vec[%d]", - dim_x[0], dim_x[1], dim_y[0])); + "X's second dimension is expected to be equal to " + "Vec's first dimension, " + "but received X's shape = [%s], Vec's shape = [%s]", + dim_x, dim_vec)); framework::DDim dim_out = framework::make_ddim({dim_x[0]}); diff --git a/paddle/fluid/operators/mv_op.cu b/paddle/fluid/operators/mv_op.cu index 9a16fe025cd71457faade38f92f56e56c26b3b32..b6d829392e3ba16c897c0565502ffb777604fb53 100644 --- a/paddle/fluid/operators/mv_op.cu +++ b/paddle/fluid/operators/mv_op.cu @@ -19,8 +19,8 @@ namespace paddle { namespace operators { template -__global__ void MVGradCUDAKernel(const int m, const int n, const T *dout, - const T *vec, T *dx) { +__global__ void MVGradDxCUDAKernel(const int m, const int n, const T *dout, + const T *vec, T *dx) { int idx = blockDim.x * blockIdx.x + threadIdx.x; for (; idx < m * n; idx += blockDim.x * gridDim.x) { int i = idx / n; @@ -52,32 +52,31 @@ class MVGradKernel int m = dim_x[0]; int n = dim_x[1]; - dx->Resize(framework::make_ddim({m * n})); - // get data ptr const T *x_data = x->data(); const T *vec_data = vec->data(); const T *dout_data = dout->data(); - T *dx_data = dx->mutable_data(context.GetPlace()); - T *dvec_data = dvec->mutable_data(context.GetPlace()); - auto &dev_ctx = context.template device_context(); auto blas = math::GetBlas(dev_ctx); - - // calculate dx auto stream = context.cuda_device_context().stream(); auto config = GetGpuLaunchConfig1D(dev_ctx, m * n); - MVGradCUDAKernel< - T><<>>( - m, n, dout_data, vec_data, dx_data); - dx->Resize(framework::make_ddim({m, n})); + if (dx) { + T *dx_data = dx->mutable_data(context.GetPlace()); + + MVGradDxCUDAKernel< + T><<>>( + m, n, dout_data, vec_data, dx_data); + } + + if (dvec) { + T *dvec_data = dvec->mutable_data(context.GetPlace()); - // calculate dvec - blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, - static_cast(0), dvec_data); + blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, + static_cast(0), dvec_data); + } } }; diff --git a/paddle/fluid/operators/mv_op.h b/paddle/fluid/operators/mv_op.h index 3c63f3640ff46f5592a244a930a191a23959baf7..e29449962989f8310391a6a0ae94c1d06cb2dbf0 100644 --- a/paddle/fluid/operators/mv_op.h +++ b/paddle/fluid/operators/mv_op.h @@ -74,30 +74,30 @@ class MVGradKernel : public framework::OpKernel { int m = dim_x[0]; int n = dim_x[1]; - dx->Resize(framework::make_ddim({m * n})); - // get data ptr const T *x_data = x->data(); const T *vec_data = vec->data(); const T *dout_data = dout->data(); - T *dx_data = dx->mutable_data(context.GetPlace()); - T *dvec_data = dvec->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.template device_context(); - auto blas = math::GetBlas(dev_ctx); + if (dx) { + T *dx_data = dx->mutable_data(context.GetPlace()); - // calculate dx - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) - dx_data[i * n + j] = dout_data[i] * vec_data[j]; + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + dx_data[i * n + j] = dout_data[i] * vec_data[j]; + } + } } - dx->Resize(framework::make_ddim({m, n})); + 
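The new `if (dx)` / `if (dvec)` branches in the CUDA kernel above and the CPU kernel below skip whichever gradient output is not requested; skipping dX also avoids allocating an m x n buffer when only dVec is needed. For out = X * Vec the gradients are the usual matrix-vector product gradients: dX is the outer product of dOut and Vec (what MVGradDxCUDAKernel and the CPU double loop produce), and dVec is X^T * dOut (the transposed GEMV call). A NumPy reference sketch that can be used to check the kernels; the function name is illustrative only:

```python
import numpy as np

def mv_grad_reference(x, vec, dout):
    """Reference gradients for out = x @ vec, with x: [m, n], vec: [n], dout: [m]."""
    dx = np.outer(dout, vec)   # dx[i, j] = dout[i] * vec[j]
    dvec = x.T @ dout          # dvec = x^T @ dout
    return dx, dvec
```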
if (dvec) { + T *dvec_data = dvec->mutable_data(context.GetPlace()); + + auto &dev_ctx = context.template device_context(); + auto blas = math::GetBlas(dev_ctx); - // calculate dvec - blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, - static_cast(0), dvec_data); + blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, + static_cast(0), dvec_data); + } } }; diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index 9317a01833307af6be88692ccfdae362d943c56f..3dc184facc78bc42f0dc45dc9ebd9c099e309930 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -45,8 +45,10 @@ template class PoolCUDNNOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::InvalidArgument("Pool operator CUDA kernel must use " + "CUDAPlace rather than CPUPlace.")); const Tensor *input = ctx.Input("X"); Tensor *output = ctx.Output("Out"); @@ -175,8 +177,10 @@ template class PoolCUDNNGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - "It must use CUDAPlace."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::InvalidArgument("Pool operator CUDA kernel must use " + "CUDAPlace rather than CPUPlace.")); const Tensor *input = ctx.Input("X"); const Tensor *output = ctx.Input("Out"); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 9900120e6c590f5d0c454fdba3ee4e936c2c409b..ba468b79605575b8957e1e963dabad9dff957eb3 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -38,18 +38,22 @@ int PoolOutputSize(int input_size, int filter_size, int padding_1, } PADDLE_ENFORCE_GT( output_size, 0, - "ShapeError: the output size must be greater than 0. But received: " - "output_size = %d due to the settings of input_size(%d), padding(%d,%d), " - "k_size(%d) and stride(%d). Please check again!", - output_size, input_size, padding_1, padding_2, filter_size, stride); + platform::errors::InvalidArgument( + "the output size must be greater than 0. But received: " + "output_size = %d due to the settings of input_size(%d), " + "padding(%d,%d), " + "k_size(%d) and stride(%d). 
Please check again!", + output_size, input_size, padding_1, padding_2, filter_size, stride)); return output_size; } void PoolOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - "X(Input) of Pooling should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - "Out(Output) of Pooling should not be null."); + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::NotFound("Input(X) of Pool operator is not found.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("Out"), true, + platform::errors::NotFound("Output(Out) of Pool operator is not found.")); std::string pooling_type = ctx->Attrs().Get("pooling_type"); std::vector ksize = ctx->Attrs().Get>("ksize"); @@ -65,28 +69,32 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { auto in_x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ( in_x_dims.size() == 4 || in_x_dims.size() == 5, true, - "ShapeError: the input of Op(pool) should be 4-D or 5-D Tensor. But " - "received: %u-D Tensor and it's shape is [%s].", - in_x_dims.size(), in_x_dims); + platform::errors::InvalidArgument( + "the input of Op(pool) should be 4-D or 5-D Tensor. But " + "received: %u-D Tensor and it's shape is [%s].", + in_x_dims.size(), in_x_dims)); PADDLE_ENFORCE_EQ( in_x_dims.size() - ksize.size(), 2U, - "ShapeError: the dimension of input minus the size of " - "Attr(ksize) must be euqal to 2 in Op(pool). " - "But received: the dimension of input minus the size " - "of Attr(ksize) is %d, the " - "input's dimension is %d, the shape of input " - "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].", - in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims, - ksize.size(), framework::make_ddim(ksize)); - - PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), - "ShapeError: the size of Attr(ksize) and Attr(strides) in " - "Op(pool) must be equal. " - "But received: Attr(ksize)'s size is %d, Attr(strides)'s " - "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].", - ksize.size(), strides.size(), framework::make_ddim(ksize), - framework::make_ddim(strides)); + platform::errors::InvalidArgument( + "the dimension of input minus the size of " + "Attr(ksize) must be euqal to 2 in Op(pool). " + "But received: the dimension of input minus the size " + "of Attr(ksize) is %d, the " + "input's dimension is %d, the shape of input " + "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].", + in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims, + ksize.size(), framework::make_ddim(ksize))); + + PADDLE_ENFORCE_EQ( + ksize.size(), strides.size(), + platform::errors::InvalidArgument( + "the size of Attr(ksize) and Attr(strides) in " + "Op(pool) must be equal. 
" + "But received: Attr(ksize)'s size is %d, Attr(strides)'s " + "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].", + ksize.size(), strides.size(), framework::make_ddim(ksize), + framework::make_ddim(strides))); // MKL-DNN Kernels are using NCHW order of dims description // so we ignore data_format consideration for MKL-DNN kernel @@ -182,9 +190,12 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar( } void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + platform::errors::NotFound( + "Input(X) of Pool Gradoperator is not found.")); PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - "Input(X@GRAD) should not be null."); + platform::errors::NotFound( + "Input(X@GRAD) of Pool Gradoperator is not found.")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } @@ -210,7 +221,8 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, - "float16 can only be used when CUDNN is used"); + platform::errors::InvalidArgument( + "Float16 can only be used when CUDNN is used")); } return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 48fb6793d2af253cc8e14e0706ff6c45bfee11e1..677c724069cf49a4bf7c1dc298cf30828d7e6e71 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -81,9 +81,11 @@ inline void UpdatePadding(std::vector* paddings, const bool global_pooling, paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); } } else { - PADDLE_ENFORCE_EQ( - data_dims.size() * 2, paddings->size(), - "Paddings size should be the same or twice as the pooling size."); + PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(), + platform::errors::InvalidArgument( + "Paddings size %d should be the same or twice as the " + "pooling size %d.", + paddings->size(), data_dims.size() * 2)); } // when padding_algorithm is "VALID" or "SAME" @@ -200,7 +202,10 @@ class PoolKernel : public framework::OpKernel { pool_process, exclusive, adaptive, out); } } break; - default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Pool op only supports 2D and 3D input.")); + } } } }; @@ -287,7 +292,10 @@ class PoolGradKernel : public framework::OpKernel { adaptive, in_x_grad); } } break; - default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Pool op only supports 2D and 3D input.")); + } } } } diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index 0371ea5b09bb30064af9dcc9f5a8a2d3a8a64fbf..3e44025e5b0e92d66d564d08062a090360a07ee8 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -46,8 +46,11 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { std::vector paddings = ctx->Attrs().Get>("paddings"); bool adaptive = ctx->Attrs().Get("adaptive"); - PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, - "Pooling intput should be 4-D or 5-D tensor."); + PADDLE_ENFORCE( + in_x_dims.size() == 4 || in_x_dims.size() == 5, 
+ platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D " + "tensor but received %dD-Tensor", + in_x_dims.size())); if (ctx->Attrs().Get("global_pooling")) { ksize.resize(static_cast(in_x_dims.size()) - 2); @@ -57,16 +60,21 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { } } - PADDLE_ENFORCE_EQ(in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "Input size and pooling size should be consistent.")); - PADDLE_ENFORCE_EQ(ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "Strides size and pooling size should be the same.")); + PADDLE_ENFORCE_EQ( + in_x_dims.size() - ksize.size(), 2U, + platform::errors::InvalidArgument( + "The input size %d minus the kernel size %d should equal to 2.", + in_x_dims.size(), ksize.size())); + PADDLE_ENFORCE_EQ( + ksize.size(), strides.size(), + platform::errors::InvalidArgument( + "Strides size %d and pooling size %d should be the same.", + strides.size(), ksize.size())); PADDLE_ENFORCE_EQ( ksize.size(), paddings.size(), platform::errors::InvalidArgument( - "Paddings size and pooling size should be the same.")); + "Paddings size %d and pooling size %d should be the same.", + paddings.size(), ksize.size())); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); if (adaptive) { diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index a6bec121d4ff002ec80a0f47510e4431176e0ddc..065d90704cf77908fff21cfdfb2f57820be15169 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -61,7 +61,10 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, mask); } break; - default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Pool op only supports 2D and 3D input.")); + } } } }; @@ -106,7 +109,10 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, paddings, adaptive, in_x_grad); } break; - default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Pool op only supports 2D and 3D input.")); + } } } } diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index 22fec3244fabe5ca466202784c0cce372d0bf6e5..748b6036008f1355bff2a46c6f19fe9f7e7a4dbd 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -176,22 +176,31 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel { int height = in_dims[2]; int width = in_dims[3]; - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - "the channels of input X should equal the product of " - "output_channels x pooled_height x pooled_width"); + PADDLE_ENFORCE_EQ( + input_channels, output_channels * pooled_height * pooled_width, + platform::errors::InvalidArgument( + "The channels %d of input X should equal the product of " + "output_channels %d x pooled_height %d x pooled_width %d.", + input_channels, output_channels, pooled_height, pooled_width)); int rois_num = rois->dims()[0]; if (rois_num == 0) return; auto rois_lod = rois->lod().back(); int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - "The rois_batch_size and input(X) batch_size must be the same."); + 
PADDLE_ENFORCE_EQ(rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, batch_size)); int rois_num_with_lod = rois_lod[rois_batch_size]; PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - "The rois_num from input and lod must be the same."); + platform::errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. Received rois %d of input(ROIs) " + "but the number of rois %d from its LOD is %d", + rois_num, rois_num_with_lod)); // set rois batch id framework::Tensor rois_batch_id_list; diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 98d9ef6b6e11440d38abbedbfd93f6d3544d77bc..562ff8d576b7d6603b3b63a88f473a5cea5ed33d 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -160,9 +160,14 @@ class GPUROIPoolOpKernel : public framework::OpKernel { if (ctx.HasInput("RoisNum")) { auto* rois_num_t = ctx.Input("RoisNum"); int rois_batch_size = rois_num_t->numel(); + PADDLE_ENFORCE_EQ( rois_batch_size, batch_size, - "The rois_batch_size and imgs batch_size must be the same."); + platform::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be the same but " + "received batch size of input(ROIs) and input(X) is %d and %d " + "respectively.", + rois_batch_size, batch_size)); std::vector rois_num_list(rois_batch_size); memory::Copy(cplace, rois_num_list.data(), gplace, rois_num_t->data(), sizeof(int) * rois_batch_size, 0); @@ -178,10 +183,19 @@ class GPUROIPoolOpKernel : public framework::OpKernel { int rois_batch_size = rois_lod.size() - 1; PADDLE_ENFORCE_EQ( rois_batch_size, batch_size, - "The rois_batch_size and imgs batch_size must be the same."); + platform::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be the same but " + "received batch size of input(ROIs) and input(X) is %d and %d " + "respectively.", + rois_batch_size, batch_size)); + int rois_num_with_lod = rois_lod[rois_batch_size]; PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - "The rois_num from input and lod must be the same."); + platform::errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. 
Received rois %d of input(ROIs) " + "but the number of rois %d from its LOD is %d", + rois_num, rois_num_with_lod)); for (int n = 0; n < rois_batch_size; ++n) { for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { roi_batch_id_data[i] = n; diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index e89cb1f5ec497133985ef8069e77e864cef47f52..ad5a942b53ec6215242e56a93e0f1a3adb2ceff2 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -30,6 +30,7 @@ __all__ = [ ] fleet = Fleet() +_final_strategy = fleet._final_strategy init = fleet.init is_first_worker = fleet.is_first_worker worker_index = fleet.worker_index diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 316b6494e347ff0352e7faf0b607425c8cdd3b50..1fc29ad0428832d1a302fb996d689e69b36c4987 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -1244,8 +1244,7 @@ class DistributedStrategy(object): if getattr(self.strategy, f.name): draws += border + "\n" draws += h1_format.format( - "{} = True, please check {}_configs".format( - f.name, f.name)) + "{}=True <-> {}_configs".format(f.name, f.name)) draws += line + "\n" my_configs = getattr(self.strategy, f.name + "_configs") diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index d0658efdca34ec5a10a0e2f07dabc357096bfad5..3fdd6e92483031d55adcdddd24abe59fa8fb555b 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -119,6 +119,8 @@ class Fleet(object): self.strategy_compiler = None self._is_collective = False self._runtime_handle = None + self._util = None + self._context = {} def init(self, role_maker=None, is_collective=False): """ @@ -233,7 +235,7 @@ class Fleet(object): Returns: int: worker numbers - + Examples: .. 
code-block:: python @@ -569,8 +571,9 @@ class Fleet(object): if strategy == None: strategy = DistributedStrategy() - self.user_defined_strategy = strategy - self.valid_strategy = None + + self._user_defined_strategy = copy.deepcopy(strategy) + self._context = {} return self @dygraph_only @@ -909,6 +912,15 @@ class Fleet(object): # imitate target optimizer retrieval return self.user_defined_optimizer.clear_grad() + def _final_strategy(self): + if "valid_strategy" not in self._context: + print( + "WARNING: You may need to call minimize function before this function is called" + ) + return {} + else: + return self._context["valid_strategy"] + def minimize(self, loss, startup_program=None, @@ -958,12 +970,15 @@ class Fleet(object): # for more examples, please reference https://github.com/PaddlePaddle/FleetX """ + context = {} + context["user_defined_strategy"] = copy.deepcopy( + self._user_defined_strategy) if paddle.fluid.framework.in_dygraph_mode(): # imitate target optimizer retrieval target_opt = self.user_defined_optimizer + self._context = context return target_opt.minimize(loss) - context = {} # cache original feed forward program self.origin_main_program = loss.block.program context["origin_main_program"] = self.origin_main_program @@ -984,17 +999,19 @@ class Fleet(object): MetaOptimizerFactory()._get_valid_meta_optimizers( self.user_defined_optimizer) - context["user_defined_strategy"] = copy.copy(self.user_defined_strategy) + context["user_defined_strategy"] = copy.deepcopy( + self._user_defined_strategy) + copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy) # trigger the auto-parallel in very strict condition # strategy = DistributedStrategy() # strategy.auto = True # optimizer = paddle.optimizer.SGD(learning_rate=0.1) # optimizer = fleet.distributed_optimizer(optimizer, strategy) - if self.user_defined_strategy._is_strict_auto(): + if copy_user_defined_strategy._is_strict_auto(): # turn on all the strategy for each optimizer for opt in distributed_optimizer_list: - opt._enable_strategy(self.user_defined_strategy, context) + opt._enable_strategy(copy_user_defined_strategy, context) valid_optimizer_list = [] valid_graph_optimizer_list = [] @@ -1003,7 +1020,7 @@ class Fleet(object): for opt in distributed_optimizer_list: opt._set_basic_info(loss, self._role_maker, self.user_defined_optimizer, - self.user_defined_strategy) + copy_user_defined_strategy) if opt._can_apply() and not opt._is_graph_out(): valid_optimizer_list.append(opt) elif opt._can_apply() and opt._is_graph_out(): @@ -1014,13 +1031,15 @@ class Fleet(object): meta_optimizer, graph_optimizer = \ self.strategy_compiler.generate_optimizer( loss, self._role_maker, self.user_defined_optimizer, - self.user_defined_strategy, valid_optimizer_list, + copy_user_defined_strategy, valid_optimizer_list, valid_graph_optimizer_list) valid_strategy = self.strategy_compiler._get_valid_strategy( - self.user_defined_strategy, can_not_apply_optimizer_list) + copy_user_defined_strategy, can_not_apply_optimizer_list) + + context["valid_strategy"] = copy.deepcopy(valid_strategy) - context["valid_strategy"] = valid_strategy + self._context = context self.valid_strategy = valid_strategy self.valid_strategy._enable_env() diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 478fecf74e4013e0d695c68af86a0e39a4a4e845..590d76ae1708f3dc17f048f3b1bf005f834d0bed 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1291,17 +1291,17 @@ def append_backward(loss, It will be 
automatically invoked by the optimizer's `minimize` function. Parameters: - loss( :ref:`api_guide_Variable_en` ): The loss variable of the network. - parameter_list(list[Variable|str], optional): List of Parameters or Parameter.names + loss(Tensor): The loss Tensor of the network. + parameter_list(list[Tensor|str], optional): List of Parameters or Parameter.names that need to be updated by optimizers. If it is None, all parameters will be updated. Default: None. - no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients - should be ignored. All variables with + no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients + should be ignored. All Tensors with `stop_gradient=True` from all blocks will be automatically added into this set. - If this parameter is not None, the Variables or Variable.names in this set will be added to the default set. + If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set. Default: None. callbacks(list[callable object], optional): List of callback functions. The callbacks are used for @@ -1312,70 +1312,73 @@ def append_backward(loss, new gradient operator is added into the program. The callable object must have two input - parameters: 'block' and 'context'. - The 'block' is the :ref:`api_guide_Block_en` which + parameters: ``block`` and ``context`` . + The ``block`` is the :ref:`api_guide_Block_en` which the new gradient operator will - be added to. The 'context' is a + be added to. The ``context`` is a map, whose keys are gradient - variable names and values are - corresponding original :ref:`api_guide_Variable_en` . - In addition to this, the 'context' + Tensor names and values are + corresponding original :ref:`api_guide_tensor_en` . + In addition to this, the ``context`` has another special key-value pair: - the key is string '__current_op_desc__' + the key is string ``__current_op_desc__`` and the value is the op_desc of the gradient operator who has just triggered the callable object. Default: None. Returns: - list of tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ): Pairs of parameter and its corresponding gradients. - The key is the parameter and the value is gradient variable. + list of tuple ( :ref:`api_guide_tensor_en` , :ref:`api_guide_tensor_en` ): Pairs of parameter and its corresponding gradients. + The key is the parameter and the value is gradient Tensor. Raises: - AssertionError: If `loss` is not an instance of Variable. + AssertionError: If ``loss`` is not an instance of Tensor. Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle + import paddle.nn.functional as F - x = fluid.data(name='x', shape=[None, 13], dtype='int64') - y = fluid.data(name='y', shape=[None, 1], dtype='float32') - x_emb = fluid.embedding(x, size=[100, 256]) - y_predict = fluid.layers.fc(input=x_emb, size=1, act=None, name='my_fc') - loss = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_loss = fluid.layers.mean(loss) + paddle.enable_static() + + x = paddle.static.data(name='x', shape=[None, 13], dtype='int64') + y = paddle.static.data(name='y', shape=[None, 1], dtype='float32') + x_emb = paddle.static.nn.embedding(x, size=[100, 256]) + y_predict = paddle.static.nn.fc(input=x_emb, size=1, act=None, name='my_fc') + loss = F.square_error_cost(input=y_predict, label=y) + avg_loss = paddle.mean(loss) # Get all weights in main_program, not include bias. - all_weights = [param for param in fluid.default_main_program().block(0).all_parameters() if 'w_' in param.name] + all_weights = [param for param in paddle.static.default_main_program().block(0).all_parameters() if 'w_' in param.name] all_weights_name = [w.name for w in all_weights] # return all param_grads needed to be updated if parameter_list set default None. - p_g_list1 = fluid.backward.append_backward(loss=avg_loss) + p_g_list1 = paddle.static.append_backward(loss=avg_loss) # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)] - # return the param_grads corresponding to parameter_list that can be list of param (Variable). - p_g_list2 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights) + # return the param_grads corresponding to parameter_list that can be list of param (Tensor). + p_g_list2 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights) # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)] # parameter_list can be list of param.name (str). - p_g_list3 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights_name) + p_g_list3 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights_name) # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)] - # no_grad_set can be set of Variables that means grad will be cut off from these Variables. - p_g_list4 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set([x_emb])) + # no_grad_set can be set of Tensors that means grad will be cut off from these Tensors. + p_g_list4 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set([x_emb])) # output: [(my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)] - # no_grad_set can be set of Variable.name when the Variable is created inside layers and can't be specified explicitly. - p_g_list5 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0'])) + # no_grad_set can be set of Tensor.name when the Tensor is created inside layers and can't be specified explicitly. + p_g_list5 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0'])) # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)] # return [] because all param_grads are filtered by no_grad_set. 
- p_g_list6 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights)) + p_g_list6 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights)) """ check_type(loss, 'loss', framework.Variable, - 'fluid.backward.append_backward') + 'paddle.static.append_backward') if loss.op is None: # the loss is from a cloned program. Find loss op manually. @@ -1387,7 +1390,7 @@ def append_backward(loss, if callbacks is not None: check_type(callbacks, 'callbacks', list, - 'fluid.backward.append_backward') + 'paddle.static.append_backward') program = loss.block.program root_block = program.block(0) @@ -1727,21 +1730,21 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): Backpropagate the gradients of targets to inputs. Args: - targets(Variable|list[Variable]): The target variables - inputs(Variable|list[Variable]): The input variables - target_gradients (Variable|list[Variable], optional): The gradient variables + targets(Tensor|list[Tensor]): The target Tensors + inputs(Tensor|list[Tensor]): The input Tensors + target_gradients (Tensor|list[Tensor], optional): The gradient Tensors of targets which has the same shape with targets, If None, ones will be created for them. - no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients - should be ignored. All variables with + no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients + should be ignored. All Tensors with `stop_gradient=True` from all blocks will be automatically added into this set. - If this parameter is not None, the Variables or Variable.names in this set will be added to the default set. + If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set. Default: None. Return: - (list[Variable]): A list of gradients for inputs - If an input does not affect targets, the corresponding gradient variable + (list[Tensor]): A list of gradients for inputs + If an input does not affect targets, the corresponding gradient Tensor will be None """ targets = _as_list(targets) @@ -1865,41 +1868,42 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): Backpropagate the gradients of targets to inputs. Args: - targets (Variable|list[Variable]): The target variables. - inputs (Variable|list[Variable]): The input variables. - target_gradients (Variable|list[Variable], optional): The gradient variables + targets (Tensor|list[Tensor]): The target Tensors. + inputs (Tensor|list[Tensor]): The input Tensors. + target_gradients (Tensor|list[Tensor], optional): The gradient Tensor of targets which has the same shape with targets, If None, ones will be created for them. - no_grad_set (set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients - should be ignored. All variables with `stop_gradient=True` from all blocks will - be automatically added into this set. If this parameter is not None, the Variables or Variable.names + no_grad_set (set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients + should be ignored. All Tensors with ``stop_gradient=True`` from all blocks will + be automatically added into this set. If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set. Default: None. 
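The target_gradients argument documented above lets callers seed the backward pass with their own gradient for each target instead of the default all-ones tensor. An untested, illustrative sketch of that usage through the public paddle.static.gradients wrapper (variable names here are hypothetical, not from the patch):

```python
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[3], dtype='float32')
    x.stop_gradient = False
    y = x * 2.0
    # Seed the backward pass with a caller-provided gradient for y.
    y_grad = paddle.static.data(name='y_grad', shape=[3], dtype='float32')
    dx = paddle.static.gradients([y], [x], target_gradients=[y_grad])
    # dx[0] propagates y_grad through y = 2 * x, i.e. it equals 2 * y_grad.
```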
Return: - (list[Variable]): A list of gradients for inputs - If an input does not affect targets, the corresponding gradient variable + (list[Tensor]): A list of gradients for inputs + If an input does not affect targets, the corresponding gradient Tensor will be None. Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle + import paddle.nn.functional as F + + paddle.enable_static() - x = fluid.data(name='x', shape=[None,2,8,8], dtype='float32') + x = paddle.static.data(name='x', shape=[None, 2, 8, 8], dtype='float32') x.stop_gradient=False - y = fluid.layers.conv2d(x, 4, 1, bias_attr=False) - y = fluid.layers.relu(y) - y = fluid.layers.conv2d(y, 4, 1, bias_attr=False) - y = fluid.layers.relu(y) - z = fluid.gradients([y], x) - print(z) + y = paddle.static.nn.conv2d(x, 4, 1, bias_attr=False) + y = F.relu(y) + z = paddle.static.gradients([y], x) + print(z) # [var x@GRAD : fluid.VarType.LOD_TENSOR.shape(-1L, 2L, 8L, 8L).astype(VarType.FP32)] """ check_type(targets, 'targets', (framework.Variable, list), - 'fluid.backward.gradients') + 'paddle.static.gradients') check_type(inputs, 'inputs', (framework.Variable, list), - 'fluid.backward.gradients') + 'paddle.static.gradients') check_type(target_gradients, 'target_gradients', ( - framework.Variable, list, type(None)), 'fluid.backward.gradients') + framework.Variable, list, type(None)), 'paddle.static.gradients') outs = calc_gradient(targets, inputs, target_gradients, no_grad_set) return _as_list(outs) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b4cea6761dcd84e047f98929644a1e264976503d..8a84c584debd0d9fc0a5a6fd64b98de66fa3efa2 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -506,11 +506,12 @@ def name_scope(prefix=None): """ :api_attr: Static Graph - Generate hierarchical name prefix for the operators. + Generate hierarchical name prefix for the operators in Static Graph. Note: This should only used for debugging and visualization purpose. Don't use it for serious analysis such as graph/program transformations. + Don't use it in dygraph, since it will cause memory leak. Args: prefix(str, optional): prefix. Default is none. @@ -518,21 +519,22 @@ def name_scope(prefix=None): Examples: .. code-block:: python - import paddle.fluid as fluid - with fluid.name_scope("s1"): - a = fluid.data(name='data', shape=[None, 1], dtype='int32') + import paddle + paddle.enable_static() + with paddle.static.name_scope("s1"): + a = paddle.data(name='data', shape=[None, 1], dtype='int32') b = a + 1 - with fluid.name_scope("s2"): + with paddle.static.name_scope("s2"): c = b * 1 - with fluid.name_scope("s3"): + with paddle.static.name_scope("s3"): d = c / 1 - with fluid.name_scope("s1"): - f = fluid.layers.pow(d, 2.0) - with fluid.name_scope("s4"): + with paddle.static.name_scope("s1"): + f = paddle.tensor.pow(d, 2.0) + with paddle.static.name_scope("s4"): g = f - 1 # Op are created in the default main program. - for op in fluid.default_main_program().block(0).ops: + for op in paddle.static.default_main_program().block(0).ops: # elementwise_add is created in /s1/ if op.type == 'elementwise_add': assert op.desc.attr("op_namescope") == '/s1/' @@ -5396,13 +5398,13 @@ def program_guard(main_program, startup_program=None): """ :api_attr: Static Graph - Change the global main program and startup program with `"with"` statement. - Layer functions in the Python `"with"` block will append operators and - variables to the new main programs. 
+ Change the global main program and startup program with ``with`` statement. + Layer functions in the Python ``with`` block will append operators and + Tensors to the new main programs. Args: - main_program(Program): New main program inside `"with"` statement. - startup_program(Program, optional): New startup program inside `"with"` + main_program(Program): New main program inside ``with`` statement. + startup_program(Program, optional): New startup program inside ``with`` statement. :code:`None` means not changing startup program, default_startup_program is still used. Default: None. @@ -5410,13 +5412,14 @@ def program_guard(main_program, startup_program=None): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle - main_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(main_program, startup_program): - data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32') - hidden = fluid.layers.fc(input=data, size=10, act='relu') + paddle.enable_static() + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32') + hidden = paddle.static.nn.fc(input=data, size=10, act='relu') Notes: The temporary :code:`Program` can be used if the user does not need to construct either of startup program or main program. @@ -5424,20 +5427,22 @@ def program_guard(main_program, startup_program=None): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle - main_program = fluid.Program() - # does not care about startup program. Just pass a temporary value. - with fluid.program_guard(main_program, fluid.Program()): - data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32') + paddle.enable_static() + main_program = paddle.static.Program() + # does not care about startup program. Just pass a temporary value. + with paddle.static.program_guard(main_program, paddle.static.Program()): + data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32') """ from .data_feeder import check_type - check_type(main_program, 'main_program', Program, 'fluid.program_guard') + check_type(main_program, 'main_program', Program, + 'paddle.static.program_guard') main_program = switch_main_program(main_program) if startup_program is not None: check_type(startup_program, 'startup_program', Program, - 'fluid.program_guard') + 'paddle.static.program_guard') startup_program = switch_startup_program(startup_program) try: yield diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3e7d10f8d1a02126c3d4bec490fcd2f3194123ee..3f99af1d37c614bc769d63eb03ff0e2175aa6ddd 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9287,8 +9287,8 @@ def pad2d(input, than height-1. And the width dimension has the same condition. Parameters: - input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format, which is a 4-D Tensor with data type float32. - paddings (Variable | List[int32]): The padding size. If padding is a List, it must + input (Tensor): The input image with [N, C, H, W] format or [N, H, W, C] format, which is a 4-D Tensor with data type float32. + paddings (Tensor | List[int32]): The padding size. If padding is a List, it must contain four integers, (padding_top, padding_bottom, padding_left, padding_right). Otherwise, it is a 1-D Tensor with shape [4]. 
Data type is int32. Default is [0, 0, 0, 0]. @@ -9304,10 +9304,7 @@ def pad2d(input, name (str, optional) : The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - Returns: a 4-D Tensor padded according to paddings and mode and data type is same as input. - - Return Type: Variable - + Returns: Tensor, a 4-D Tensor padded according to paddings and mode and data type is same as input. Examples: .. code-block:: text @@ -9340,9 +9337,33 @@ def pad2d(input, Code Examples: .. code-block:: python - import paddle.fluid as fluid - data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32') - result = fluid.layers.pad2d(input=data, paddings=[0, 1, 2, 3], mode='reflect') + import numpy as np + import paddle + import paddle.nn.functional as F + + # example 1 + x_shape = (1, 1, 3, 4) + x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1 + tensor_x = paddle.to_tensor(x) + y = F.pad2d(tensor_x, paddings=[1, 2, 2, 1], pad_value=1, mode='constant') + print(y.numpy()) + # [[[[ 1. 1. 1. 1. 1. 1. 1.] + # [ 1. 1. 1. 2. 3. 4. 1.] + # [ 1. 1. 5. 6. 7. 8. 1.] + # [ 1. 1. 9. 10. 11. 12. 1.] + # [ 1. 1. 1. 1. 1. 1. 1.] + # [ 1. 1. 1. 1. 1. 1. 1.]]]] + + # example 2 + x_shape = (1, 1, 2, 3) + x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1 + tensor_x = paddle.to_tensor(x) + y = F.pad2d(tensor_x, paddings=[1, 1, 1, 1], mode='reflect') + print(y.numpy()) + # [[[[5. 4. 5. 6. 5.] + # [2. 1. 2. 3. 2.] + # [5. 4. 5. 6. 5.] + # [2. 1. 2. 3. 2.]]]] """ check_variable_and_dtype( input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 97a3ebc2135a0649fff88e1a1c14d02dfb7850b1..5df32c5df42a1b48c917c04838542d36ed234b26 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -394,7 +394,8 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) -py_test_modules(test_warpctc_op MODULES test_warpctc_op) +# disable test_warpctc_op +# py_test_modules(test_warpctc_op MODULES test_warpctc_op) py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS}) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS}) py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py index 5b7e0fb94c662f4aa47fbaad964e03c576c97807..b8393f1e28a947f58ea4e42f4ee673f86f1fcb45 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py @@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) - self.assertTrue(optimizer.user_defined_strategy.a_sync) - a_sync_configs = optimizer.user_defined_strategy.a_sync_configs + self.assertTrue(fleet._final_strategy().a_sync) + a_sync_configs = fleet._final_strategy().a_sync_configs self.assertTrue(a_sync_configs['k_steps'] == 0) diff --git 
a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py index 3dff9d0f9d82530cade09a737d448fca4bf4f960..49b34f059e8d899a9220831936e4e05ecea24a91 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py @@ -72,8 +72,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) - self.assertTrue(optimizer.user_defined_strategy.a_sync) - a_sync_configs = optimizer.user_defined_strategy.a_sync_configs + self.assertTrue(fleet._final_strategy().a_sync) + a_sync_configs = fleet._final_strategy().a_sync_configs self.assertTrue(a_sync_configs['k_steps'] == 0) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py index bdfa3a9a7d57869466b895f23674b6e8ef83310f..334a4e028b2c4f5cdf7fab91bf58d20d43e57daa 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py @@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) - self.assertTrue(optimizer.user_defined_strategy.a_sync) - a_sync_configs = optimizer.user_defined_strategy.a_sync_configs + self.assertTrue(fleet._final_strategy().a_sync) + a_sync_configs = fleet._final_strategy().a_sync_configs self.assertTrue(a_sync_configs['k_steps'] == 800) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py index 73e014b35008ff5a0539c6a338755b9dc2cf68d4..362428631e68cc7ac88be93d7ba1ff449a035822 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -18,6 +18,8 @@ import unittest import paddle import os +paddle.enable_static() + class TestFleetAMPOptimizer(unittest.TestCase): def setUp(self): @@ -55,6 +57,8 @@ class TestFleetAMPOptimizer(unittest.TestCase): optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) + strategy = fleet._final_strategy() + ops = [op.type for op in avg_cost.block.ops] self.assertIn('cast', ops) self.assertIn('check_finite_and_unscale', ops) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_auto.py b/python/paddle/fluid/tests/unittests/test_fleet_auto.py index 020f2f4db382ef1277167d85917e8fdba9c83893..0a4e2f631d60cf1dad790d720c88074090a08ca8 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_auto.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_auto.py @@ -18,6 +18,8 @@ import os import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker +paddle.enable_static() + class TestDistributedStrategyAuto(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py index ccd57c4d51529f2c178f6b7c6c8f98851b1e9e80..4945c158025b7e7337081160c8a156c90ce1dcd3 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base.py +++ 
b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -167,6 +167,8 @@ class TestFleetDygraph(unittest.TestCase): state_dict = adam.state_dict() adam.set_state_dict(state_dict) + final_strategy = fleet._final_strategy() + class TestFleetBaseSingleRunCollective(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py index ec055178d90c529080489218f3aca1a71311beea..022e0b99ce87113c0085addc0370594780d919f1 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py @@ -19,6 +19,8 @@ import os import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker +paddle.enable_static() + class TestFleetLambMetaOptimizer(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py index 0a70710b4590e253463640634615c2d11ff36e9f..e4cc3682d1a24f80bb52d57c5734ac0686bfeb63 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py @@ -19,6 +19,8 @@ import os import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker +paddle.enable_static() + class TestFleetLarsMetaOptimizer(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_mv_op.py b/python/paddle/fluid/tests/unittests/test_mv_op.py index 6b930e59aa554c57ba1ecae2c01aaefabbe578e9..e0d23e7871fb231892dcdf9ed32bfd31d3967f58 100644 --- a/python/paddle/fluid/tests/unittests/test_mv_op.py +++ b/python/paddle/fluid/tests/unittests/test_mv_op.py @@ -20,6 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core +from paddle.static import program_guard, Program from op_test import OpTest @@ -37,7 +38,7 @@ class TestMVOp(OpTest): self.check_grad(['X', 'Vec'], 'Out') def init_config(self): - self.x = np.random.random((5, 100)).astype("float64") + self.x = np.random.random((2, 100)).astype("float64") self.vec = np.random.random((100)).astype("float64") @@ -57,21 +58,36 @@ class TestMVAPI(unittest.TestCase): paddle.enable_static() def test_static_graph(self): - paddle.enable_static() + for x_stop_gradient in [False, True]: + for vec_stop_gradient in [False, True]: + + paddle.enable_static() + + train_program = Program() + startup_program = Program() + + self.input_x = np.random.rand(5, 100).astype("float64") + self.input_vec = np.random.rand(100).astype("float64") + + with program_guard(train_program, startup_program): + data_x = paddle.static.data( + "x", shape=[5, 100], dtype="float64") + data_vec = paddle.static.data( + "vec", shape=[100], dtype="float64") + + data_x.stop_gradient = x_stop_gradient + data_vec.stop_gradient = vec_stop_gradient + + result_vec = paddle.mv(data_x, data_vec) - self.input_x = np.random.rand(5, 100).astype("float64") - self.input_vec = np.random.rand(100).astype("float64") - - data_x = paddle.static.data("x", shape=[5, 100], dtype="float64") - data_vec = paddle.static.data("vec", shape=[100], dtype="float64") - result_vec = paddle.mv(data_x, data_vec) - self.place = paddle.CPUPlace() - exe = paddle.static.Executor(self.place) - res, = exe.run(feed={"x": self.input_x, - "vec": self.input_vec}, 
- fetch_list=[result_vec]) - z_expected = np.array(np.dot(self.input_x, self.input_vec)) - self.assertTrue(np.allclose(res, z_expected)) + self.place = paddle.CPUPlace() + exe = paddle.static.Executor(self.place) + res, = exe.run( + feed={"x": self.input_x, + "vec": self.input_vec}, + fetch_list=[result_vec]) + z_expected = np.array(np.dot(self.input_x, self.input_vec)) + self.assertTrue(np.allclose(res, z_expected)) class TestMVError(unittest.TestCase): diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 5cf4953933242292c6a732513dbee2164811dd35..55c9b70c8a21ba0eb98286a8b04a0c1747d3df78 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -30,7 +30,6 @@ from ...fluid.layers import nn, utils from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.param_attr import ParamAttr from ...fluid.layer_helper import LayerHelper -from .common import pad2d def _is_list_or_tuple(input): diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index 095a34cb6fc68cda6900790141d226208b203f82..c9e49ce6149f02366b1c95c2a7dda29b7d50bb74 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -14,24 +14,20 @@ __all__ = [ 'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam', - 'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', - 'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer', - 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', 'LambOptimizer', - 'LarsMomentum', 'LarsMomentumOptimizer', 'LookaheadOptimizer', - 'ModelAverage', 'Momentum', 'MomentumOptimizer', 'PipelineOptimizer', - 'RecomputeOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', - '_LRScheduler', 'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', - 'PolynomialLR', 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', - 'LambdaLR', 'ReduceLROnPlateau', 'CosineAnnealingLR' + 'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Dpsgd', + 'DpsgdOptimizer', 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', + 'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'MomentumOptimizer', + 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR', + 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR', + 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR', + 'ReduceLROnPlateau', 'CosineAnnealingLR' ] from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\ - AdagradOptimizer,DpsgdOptimizer,\ - DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \ - ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\ - ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \ - RecomputeOptimizer, LarsMomentumOptimizer + AdagradOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, \ + FtrlOptimizer, AdadeltaOptimizer, ModelAverage, \ + ExponentialMovingAverage, LookaheadOptimizer from .optimizer import Optimizer from .adam import Adam
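The fleet changes earlier in this patch add a `_final_strategy()` accessor (exported as `paddle.distributed.fleet._final_strategy`) that returns the distributed strategy actually applied once `minimize()` has resolved the meta-optimizers; called before `minimize()`, it only prints a warning and returns an empty dict. A rough usage sketch in the spirit of the updated unit tests, assuming a fleet role-maker environment (the usual PADDLE_* / TRAINING_ROLE variables) is already configured:

```python
import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=False)

x = paddle.static.data(name='x', shape=[None, 10], dtype='float32')
y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
pred = paddle.static.nn.fc(input=x, size=1)
cost = paddle.nn.functional.square_error_cost(input=pred, label=y)
avg_cost = paddle.mean(cost)

strategy = fleet.DistributedStrategy()
strategy.a_sync = True

optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)

# Only after minimize() does the resolved strategy become available.
final = fleet._final_strategy()
print(final.a_sync, final.a_sync_configs['k_steps'])
```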