Commit 1a13420b authored by LiuChiaChi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add-transformer-generate_square_subsequent_mask-api
@@ -30,8 +30,10 @@ __global__ void ComputeDifferent(T *centers_diff, const T *X, const T *centers,
   while (idy < K) {
     int64_t id = ids[idy];
-    PADDLE_ENFORCE(id >= 0, "received id:", id);
-    PADDLE_ENFORCE(id < N, "received id:", id);
+    PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id);
+    PADDLE_ENFORCE(id < N, "Id should smaller than %d but received id: %d.", N,
+                   id);
     T *out = centers_diff + idy * D;
     const T *x = X + idy * D;
     const T *cent = centers + id * D;
@@ -52,8 +54,9 @@ __global__ void UpdateCenters(T *centers, T *centers_diff, const int64_t *ids,
   while (idy < K) {
     int count = 1;
     int64_t id = ids[idy];
-    PADDLE_ENFORCE(id >= 0, "received id:", id);
-    PADDLE_ENFORCE(id < N, "received id:", id);
+    PADDLE_ENFORCE(id >= 0, "Id should larger than 0 but received id: %d.", id);
+    PADDLE_ENFORCE(id < N, "Id should smaller than %d but received id: %d.", N,
+                   id);
     for (int i = 0; i < K; i++) {
       if (ids[i] == id) {
...
@@ -69,8 +69,10 @@ template <typename T>
 class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "CTCAlign operator CUDA kernel must use CUDAPlace "
+                          "rather than CPUPlace."));
     auto* input = ctx.Input<LoDTensor>("Input");
     auto* output = ctx.Output<LoDTensor>("Output");
     const int blank = ctx.Attr<int>("blank");
...
@@ -72,8 +72,11 @@ class CTCAlignKernel : public framework::OpKernel<T> {
     // check input dims and lod
     PADDLE_ENFORCE_EQ(
         input_dims[0], static_cast<int64_t>(input_lod[level].back()),
-        "The first dimension of Input(Input) should be equal to "
-        "the sum of all sequences' lengths.");
+        platform::errors::InvalidArgument(
+            "The first dimension %d of CTCAlign operator Input(Input) should "
+            "be equal to "
+            "the sum of all sequences' lengths %d.",
+            input_dims[0], static_cast<int64_t>(input_lod[level].back())));
     const size_t num_sequences = input_lod[level].size() - 1;
...
@@ -42,21 +42,21 @@ class MVOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv");

     auto dim_x = context->GetInputDim("X");
-    auto dim_y = context->GetInputDim("Vec");
+    auto dim_vec = context->GetInputDim("Vec");
     PADDLE_ENFORCE_EQ(
         dim_x.size(), 2,
         platform::errors::InvalidArgument(
             "The rank of input X should be 2, but is %d", dim_x.size()));
     PADDLE_ENFORCE_EQ(
-        dim_y.size(), 1,
+        dim_vec.size(), 1,
         platform::errors::InvalidArgument(
-            "The rank of input Vec should be 1, but is %d", dim_y.size()));
-    PADDLE_ENFORCE_EQ(dim_x[1] == dim_y[0], true,
+            "The rank of input Vec should be 1, but is %d", dim_vec.size()));
+    PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0],
                       platform::errors::InvalidArgument(
-                          "The length of input X' second dim should equal the "
-                          "length of input Vec,"
-                          " but X[%d, %d], Vec[%d]",
-                          dim_x[0], dim_x[1], dim_y[0]));
+                          "X's second dimension is expected to be equal to "
+                          "Vec's first dimension"
+                          "but recieved X'shape = [%s], Vec's shape = [%s]",
+                          dim_x, dim_vec));

     framework::DDim dim_out = framework::make_ddim({dim_x[0]});
...
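As a reference for the shape checks above, here is a minimal usage sketch; it assumes the operator is exposed as `paddle.mv` and is run in dygraph mode, and the shapes are illustrative only:

```python
import paddle

# X is [m, n] = [3, 4], Vec is [n] = [4]; the InferShape above requires
# dim_x[1] == dim_vec[0] and produces Out with shape [m] = [3].
x = paddle.rand([3, 4], dtype='float32')
vec = paddle.rand([4], dtype='float32')
out = paddle.mv(x, vec)
print(out.shape)  # [3]
```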
@@ -19,8 +19,8 @@ namespace paddle {
 namespace operators {

 template <typename T>
-__global__ void MVGradCUDAKernel(const int m, const int n, const T *dout,
-                                 const T *vec, T *dx) {
+__global__ void MVGradDxCUDAKernel(const int m, const int n, const T *dout,
+                                   const T *vec, T *dx) {
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   for (; idx < m * n; idx += blockDim.x * gridDim.x) {
     int i = idx / n;
@@ -52,32 +52,31 @@ class MVGradKernel<platform::CUDADeviceContext, T>
     int m = dim_x[0];
     int n = dim_x[1];

-    dx->Resize(framework::make_ddim({m * n}));
-
     // get data ptr
     const T *x_data = x->data<T>();
     const T *vec_data = vec->data<T>();
     const T *dout_data = dout->data<T>();
-    T *dx_data = dx->mutable_data<T>(context.GetPlace());
-    T *dvec_data = dvec->mutable_data<T>(context.GetPlace());

     auto &dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
     auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);

-    // calculate dx
     auto stream = context.cuda_device_context().stream();
     auto config = GetGpuLaunchConfig1D(dev_ctx, m * n);
-    MVGradCUDAKernel<
-        T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
-        m, n, dout_data, vec_data, dx_data);
-    dx->Resize(framework::make_ddim({m, n}));

-    // calculate dvec
-    blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
-              static_cast<T>(0), dvec_data);
+    if (dx) {
+      T *dx_data = dx->mutable_data<T>(context.GetPlace());
+      MVGradDxCUDAKernel<
+          T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
+          m, n, dout_data, vec_data, dx_data);
+    }
+
+    if (dvec) {
+      T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
+      blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
+                static_cast<T>(0), dvec_data);
+    }
   }
 };
...
@@ -74,30 +74,30 @@ class MVGradKernel : public framework::OpKernel<T> {
     int m = dim_x[0];
     int n = dim_x[1];

-    dx->Resize(framework::make_ddim({m * n}));
-
     // get data ptr
     const T *x_data = x->data<T>();
     const T *vec_data = vec->data<T>();
     const T *dout_data = dout->data<T>();
-    T *dx_data = dx->mutable_data<T>(context.GetPlace());
-    T *dvec_data = dvec->mutable_data<T>(context.GetPlace());

-    auto &dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    if (dx) {
+      T *dx_data = dx->mutable_data<T>(context.GetPlace());

-    // calculate dx
-    for (int i = 0; i < m; ++i) {
-      for (int j = 0; j < n; ++j)
-        dx_data[i * n + j] = dout_data[i] * vec_data[j];
-    }
-    dx->Resize(framework::make_ddim({m, n}));
+      for (int i = 0; i < m; ++i) {
+        for (int j = 0; j < n; ++j) {
+          dx_data[i * n + j] = dout_data[i] * vec_data[j];
+        }
+      }
+    }

-    // calculate dvec
-    blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
-              static_cast<T>(0), dvec_data);
+    if (dvec) {
+      T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
+
+      auto &dev_ctx = context.template device_context<DeviceContext>();
+      auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+
+      blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
+                static_cast<T>(0), dvec_data);
+    }
   }
 };
...
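Both MV gradient kernels above (CUDA and CPU) implement the same two formulas; writing them out once may make the `if (dx)` / `if (dvec)` split easier to follow. Assuming the forward op computes Out = X·Vec with X of shape [m, n] and Vec of shape [n]:

$$
\frac{\partial L}{\partial X_{ij}} = \frac{\partial L}{\partial \mathrm{Out}_i}\,\mathrm{Vec}_j
\qquad\text{and}\qquad
\frac{\partial L}{\partial \mathrm{Vec}} = X^{\top}\,\frac{\partial L}{\partial \mathrm{Out}},
$$

i.e. the element-wise product written into `dx_data[i * n + j]` and the transposed GEMV that fills `dvec_data`. The new guards simply skip whichever gradient output was not requested.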
@@ -45,8 +45,10 @@ template <typename T>
 class PoolCUDNNOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
+                                          "CUDAPlace rather than CPUPlace."));
     const Tensor *input = ctx.Input<Tensor>("X");
     Tensor *output = ctx.Output<Tensor>("Out");
@@ -175,8 +177,10 @@ template <typename T>
 class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
-                      "It must use CUDAPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::InvalidArgument("Pool operator CUDA kernel must use "
+                                          "CUDAPlace rather than CPUPlace."));
     const Tensor *input = ctx.Input<Tensor>("X");
     const Tensor *output = ctx.Input<Tensor>("Out");
...
@@ -38,18 +38,22 @@ int PoolOutputSize(int input_size, int filter_size, int padding_1,
   }
   PADDLE_ENFORCE_GT(
       output_size, 0,
-      "ShapeError: the output size must be greater than 0. But received: "
-      "output_size = %d due to the settings of input_size(%d), padding(%d,%d), "
-      "k_size(%d) and stride(%d). Please check again!",
-      output_size, input_size, padding_1, padding_2, filter_size, stride);
+      platform::errors::InvalidArgument(
+          "the output size must be greater than 0. But received: "
+          "output_size = %d due to the settings of input_size(%d), "
+          "padding(%d,%d), "
+          "k_size(%d) and stride(%d). Please check again!",
+          output_size, input_size, padding_1, padding_2, filter_size, stride));
   return output_size;
 }

 void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                    "X(Input) of Pooling should not be null.");
-  PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                    "Out(Output) of Pooling should not be null.");
+  PADDLE_ENFORCE_EQ(
+      ctx->HasInput("X"), true,
+      platform::errors::NotFound("Input(X) of Pool operator is not found."));
+  PADDLE_ENFORCE_EQ(
+      ctx->HasOutput("Out"), true,
+      platform::errors::NotFound("Output(Out) of Pool operator is not found."));

   std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
   std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
@@ -65,28 +69,32 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
   auto in_x_dims = ctx->GetInputDim("X");
   PADDLE_ENFORCE_EQ(
       in_x_dims.size() == 4 || in_x_dims.size() == 5, true,
-      "ShapeError: the input of Op(pool) should be 4-D or 5-D Tensor. But "
-      "received: %u-D Tensor and it's shape is [%s].",
-      in_x_dims.size(), in_x_dims);
+      platform::errors::InvalidArgument(
+          "the input of Op(pool) should be 4-D or 5-D Tensor. But "
+          "received: %u-D Tensor and it's shape is [%s].",
+          in_x_dims.size(), in_x_dims));

   PADDLE_ENFORCE_EQ(
       in_x_dims.size() - ksize.size(), 2U,
-      "ShapeError: the dimension of input minus the size of "
-      "Attr(ksize) must be euqal to 2 in Op(pool). "
-      "But received: the dimension of input minus the size "
-      "of Attr(ksize) is %d, the "
-      "input's dimension is %d, the shape of input "
-      "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
-      in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
-      ksize.size(), framework::make_ddim(ksize));
+      platform::errors::InvalidArgument(
+          "the dimension of input minus the size of "
+          "Attr(ksize) must be euqal to 2 in Op(pool). "
+          "But received: the dimension of input minus the size "
+          "of Attr(ksize) is %d, the "
+          "input's dimension is %d, the shape of input "
+          "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].",
+          in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims,
+          ksize.size(), framework::make_ddim(ksize)));

-  PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
-                    "ShapeError: the size of Attr(ksize) and Attr(strides) in "
-                    "Op(pool) must be equal. "
-                    "But received: Attr(ksize)'s size is %d, Attr(strides)'s "
-                    "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
-                    ksize.size(), strides.size(), framework::make_ddim(ksize),
-                    framework::make_ddim(strides));
+  PADDLE_ENFORCE_EQ(
+      ksize.size(), strides.size(),
+      platform::errors::InvalidArgument(
+          "the size of Attr(ksize) and Attr(strides) in "
+          "Op(pool) must be equal. "
+          "But received: Attr(ksize)'s size is %d, Attr(strides)'s "
+          "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].",
+          ksize.size(), strides.size(), framework::make_ddim(ksize),
+          framework::make_ddim(strides)));

   // MKL-DNN Kernels are using NCHW order of dims description
   // so we ignore data_format consideration for MKL-DNN kernel
@@ -182,9 +190,12 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar(
 }

 void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const {
-  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                    platform::errors::NotFound(
+                        "Input(X) of Pool Gradoperator is not found."));
   PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
-                    "Input(X@GRAD) should not be null.");
+                    platform::errors::NotFound(
+                        "Input(X@GRAD) of Pool Gradoperator is not found."));
   ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
 }
@@ -210,7 +221,8 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
   auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
   if (input_data_type == framework::proto::VarType::FP16) {
     PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN,
-                      "float16 can only be used when CUDNN is used");
+                      platform::errors::InvalidArgument(
+                          "Float16 can only be used when CUDNN is used"));
   }
   return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_,
                                  library_);
...
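As a reference for the `PoolOutputSize` check above, the usual (non-ceil-mode) output-size arithmetic that produces the `output_size` being validated is

$$
\text{output\_size} \;=\; \left\lfloor\frac{\text{input\_size} + \text{padding}_1 + \text{padding}_2 - \text{k\_size}}{\text{stride}}\right\rfloor + 1,
$$

with ceil mode adding stride - 1 to the numerator before the division, so a non-positive result (and hence the new InvalidArgument error) only arises when the kernel is larger than the padded input or the stride setting is inconsistent.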
@@ -81,9 +81,11 @@ inline void UpdatePadding(std::vector<T>* paddings, const bool global_pooling,
       paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
     }
   } else {
-    PADDLE_ENFORCE_EQ(
-        data_dims.size() * 2, paddings->size(),
-        "Paddings size should be the same or twice as the pooling size.");
+    PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(),
+                      platform::errors::InvalidArgument(
+                          "Paddings size %d should be the same or twice as the "
+                          "pooling size %d.",
+                          paddings->size(), data_dims.size() * 2));
   }

   // when padding_algorithm is "VALID" or "SAME"
@@ -200,7 +202,10 @@ class PoolKernel : public framework::OpKernel<T> {
                         pool_process, exclusive, adaptive, out);
         }
       } break;
-      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+      default: {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Pool op only supports 2D and 3D input."));
+      }
     }
   }
 };
@@ -287,7 +292,10 @@ class PoolGradKernel : public framework::OpKernel<T> {
                           adaptive, in_x_grad);
         }
       } break;
-      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+      default: {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Pool op only supports 2D and 3D input."));
+      }
     }
   }
 }
...
@@ -46,8 +46,11 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
     std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
     bool adaptive = ctx->Attrs().Get<bool>("adaptive");

-    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
-                   "Pooling intput should be 4-D or 5-D tensor.");
+    PADDLE_ENFORCE(
+        in_x_dims.size() == 4 || in_x_dims.size() == 5,
+        platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D "
+                                          "tensor but received %dD-Tensor",
+                                          in_x_dims.size()));

     if (ctx->Attrs().Get<bool>("global_pooling")) {
       ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
@@ -57,16 +60,21 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
       }
     }

-    PADDLE_ENFORCE_EQ(in_x_dims.size() - ksize.size(), 2U,
-                      platform::errors::InvalidArgument(
-                          "Input size and pooling size should be consistent."));
-    PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
-                      platform::errors::InvalidArgument(
-                          "Strides size and pooling size should be the same."));
+    PADDLE_ENFORCE_EQ(
+        in_x_dims.size() - ksize.size(), 2U,
+        platform::errors::InvalidArgument(
+            "The input size %d minus the kernel size %d should equal to 2.",
+            in_x_dims.size(), ksize.size()));
+    PADDLE_ENFORCE_EQ(
+        ksize.size(), strides.size(),
+        platform::errors::InvalidArgument(
+            "Strides size %d and pooling size %d should be the same.",
+            strides.size(), ksize.size()));
     PADDLE_ENFORCE_EQ(
         ksize.size(), paddings.size(),
         platform::errors::InvalidArgument(
-            "Paddings size and pooling size should be the same."));
+            "Paddings size %d and pooling size %d should be the same.",
+            paddings.size(), ksize.size()));

     std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
     if (adaptive) {
...
@@ -61,7 +61,10 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
         pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
                        mask);
       } break;
-      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+      default: {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Pool op only supports 2D and 3D input."));
+      }
     }
   }
 };
@@ -106,7 +109,10 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
         pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
                         paddings, adaptive, in_x_grad);
       } break;
-      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+      default: {
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Pool op only supports 2D and 3D input."));
+      }
     }
   }
 }
...
@@ -176,22 +176,31 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel<T> {
     int height = in_dims[2];
     int width = in_dims[3];

-    PADDLE_ENFORCE_EQ(input_channels,
-                      output_channels * pooled_height * pooled_width,
-                      "the channels of input X should equal the product of "
-                      "output_channels x pooled_height x pooled_width");
+    PADDLE_ENFORCE_EQ(
+        input_channels, output_channels * pooled_height * pooled_width,
+        platform::errors::InvalidArgument(
+            "The channels %d of input X should equal the product of "
+            "output_channels %d x pooled_height %d x pooled_width %d.",
+            input_channels, output_channels, pooled_height, pooled_width));

     int rois_num = rois->dims()[0];
     if (rois_num == 0) return;

     auto rois_lod = rois->lod().back();
     int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        "The rois_batch_size and input(X) batch_size must be the same.");
+    PADDLE_ENFORCE_EQ(rois_batch_size, batch_size,
+                      platform::errors::InvalidArgument(
+                          "The batch size of input(ROIs) and input(X) must be "
+                          "the same but received batch size of input(ROIs) and "
+                          "input(X) is %d and %d respectively.",
+                          rois_batch_size, batch_size));
     int rois_num_with_lod = rois_lod[rois_batch_size];
     PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                      "The rois_num from input and lod must be the same.");
+                      platform::errors::InvalidArgument(
+                          "The number of rois from input(ROIs) and its LOD "
+                          "must be the same. Received rois %d of input(ROIs) "
+                          "but the number of rois %d from its LOD is %d",
+                          rois_num, rois_num_with_lod));

     // set rois batch id
     framework::Tensor rois_batch_id_list;
...
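For context on the first check in this hunk: position-sensitive ROI pooling consumes one input-channel group per output cell, so the shapes have to satisfy

$$
C_{\text{in}} \;=\; C_{\text{out}} \times H_{\text{pooled}} \times W_{\text{pooled}},
$$

which is exactly the equality the new InvalidArgument message reports, together with the four offending values, when it fails.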
@@ -160,9 +160,14 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
     if (ctx.HasInput("RoisNum")) {
       auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
       int rois_batch_size = rois_num_t->numel();
       PADDLE_ENFORCE_EQ(
           rois_batch_size, batch_size,
-          "The rois_batch_size and imgs batch_size must be the same.");
+          platform::errors::InvalidArgument(
+              "The batch size of input(ROIs) and input(X) must be the same but "
+              "received batch size of input(ROIs) and input(X) is %d and %d "
+              "respectively.",
+              rois_batch_size, batch_size));
       std::vector<int> rois_num_list(rois_batch_size);
       memory::Copy(cplace, rois_num_list.data(), gplace,
                    rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
@@ -178,10 +183,19 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
       int rois_batch_size = rois_lod.size() - 1;
       PADDLE_ENFORCE_EQ(
           rois_batch_size, batch_size,
-          "The rois_batch_size and imgs batch_size must be the same.");
+          platform::errors::InvalidArgument(
+              "The batch size of input(ROIs) and input(X) must be the same but "
+              "received batch size of input(ROIs) and input(X) is %d and %d "
+              "respectively.",
+              rois_batch_size, batch_size));
       int rois_num_with_lod = rois_lod[rois_batch_size];
       PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
-                        "The rois_num from input and lod must be the same.");
+                        platform::errors::InvalidArgument(
+                            "The number of rois from input(ROIs) and its LOD "
+                            "must be the same. Received rois %d of input(ROIs) "
+                            "but the number of rois %d from its LOD is %d",
+                            rois_num, rois_num_with_lod));
       for (int n = 0; n < rois_batch_size; ++n) {
         for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
           roi_batch_id_data[i] = n;
...
@@ -30,6 +30,7 @@ __all__ = [
 ]

 fleet = Fleet()
+_final_strategy = fleet._final_strategy
 init = fleet.init
 is_first_worker = fleet.is_first_worker
 worker_index = fleet.worker_index
...
@@ -1244,8 +1244,7 @@ class DistributedStrategy(object):
             if getattr(self.strategy, f.name):
                 draws += border + "\n"
                 draws += h1_format.format(
-                    "{} = True, please check {}_configs".format(
-                        f.name, f.name))
+                    "{}=True <-> {}_configs".format(f.name, f.name))
                 draws += line + "\n"
                 my_configs = getattr(self.strategy,
                                      f.name + "_configs")
...
@@ -119,6 +119,8 @@ class Fleet(object):
         self.strategy_compiler = None
         self._is_collective = False
         self._runtime_handle = None
+        self._util = None
+        self._context = {}

     def init(self, role_maker=None, is_collective=False):
         """
@@ -233,7 +235,7 @@ class Fleet(object):
         Returns:
             int: worker numbers

         Examples:

             .. code-block:: python
@@ -569,8 +571,9 @@ class Fleet(object):
         if strategy == None:
             strategy = DistributedStrategy()
-        self.user_defined_strategy = strategy
-        self.valid_strategy = None
+        self._user_defined_strategy = copy.deepcopy(strategy)
+        self._context = {}
         return self

     @dygraph_only
@@ -909,6 +912,15 @@ class Fleet(object):
         # imitate target optimizer retrieval
         return self.user_defined_optimizer.clear_grad()

+    def _final_strategy(self):
+        if "valid_strategy" not in self._context:
+            print(
+                "WARNING: You may need to call minimize function before this function is called"
+            )
+            return {}
+        else:
+            return self._context["valid_strategy"]
+
     def minimize(self,
                  loss,
                  startup_program=None,
@@ -958,12 +970,15 @@ class Fleet(object):
             # for more examples, please reference https://github.com/PaddlePaddle/FleetX

         """
+        context = {}
+        context["user_defined_strategy"] = copy.deepcopy(
+            self._user_defined_strategy)
         if paddle.fluid.framework.in_dygraph_mode():
             # imitate target optimizer retrieval
             target_opt = self.user_defined_optimizer
+            self._context = context
             return target_opt.minimize(loss)

-        context = {}
         # cache original feed forward program
         self.origin_main_program = loss.block.program
         context["origin_main_program"] = self.origin_main_program
@@ -984,17 +999,19 @@ class Fleet(object):
             MetaOptimizerFactory()._get_valid_meta_optimizers(
                 self.user_defined_optimizer)

-        context["user_defined_strategy"] = copy.copy(self.user_defined_strategy)
+        context["user_defined_strategy"] = copy.deepcopy(
+            self._user_defined_strategy)
+        copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy)

         # trigger the auto-parallel in very strict condition
         # strategy = DistributedStrategy()
         # strategy.auto = True
         # optimizer = paddle.optimizer.SGD(learning_rate=0.1)
         # optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        if self.user_defined_strategy._is_strict_auto():
+        if copy_user_defined_strategy._is_strict_auto():
             # turn on all the strategy for each optimizer
             for opt in distributed_optimizer_list:
-                opt._enable_strategy(self.user_defined_strategy, context)
+                opt._enable_strategy(copy_user_defined_strategy, context)

         valid_optimizer_list = []
         valid_graph_optimizer_list = []
@@ -1003,7 +1020,7 @@ class Fleet(object):
         for opt in distributed_optimizer_list:
             opt._set_basic_info(loss, self._role_maker,
                                 self.user_defined_optimizer,
-                                self.user_defined_strategy)
+                                copy_user_defined_strategy)
             if opt._can_apply() and not opt._is_graph_out():
                 valid_optimizer_list.append(opt)
             elif opt._can_apply() and opt._is_graph_out():
@@ -1014,13 +1031,15 @@ class Fleet(object):
         meta_optimizer, graph_optimizer = \
             self.strategy_compiler.generate_optimizer(
                 loss, self._role_maker, self.user_defined_optimizer,
-                self.user_defined_strategy, valid_optimizer_list,
+                copy_user_defined_strategy, valid_optimizer_list,
                 valid_graph_optimizer_list)

         valid_strategy = self.strategy_compiler._get_valid_strategy(
-            self.user_defined_strategy, can_not_apply_optimizer_list)
+            copy_user_defined_strategy, can_not_apply_optimizer_list)
+
+        context["valid_strategy"] = copy.deepcopy(valid_strategy)

-        context["valid_strategy"] = valid_strategy
+        self._context = context

         self.valid_strategy = valid_strategy
         self.valid_strategy._enable_env()
...
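The unit tests further down switch from `optimizer.user_defined_strategy` to this new accessor; the sketch below shows the intended call order, assuming a single-process collective run so that `fleet.init` needs no extra role-maker configuration:

```python
import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

# Before minimize() has populated the internal context there is no
# "valid_strategy" entry, so _final_strategy() warns and returns {}.
print(fleet._final_strategy())  # {}

# After building a network and running
#     optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
#     optimizer.minimize(avg_cost)
# fleet._final_strategy() returns the DistributedStrategy that was actually
# applied, replacing the old optimizer.user_defined_strategy access.
```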
@@ -1291,17 +1291,17 @@ def append_backward(loss,
     It will be automatically invoked by the optimizer's `minimize` function.

     Parameters:
-        loss( :ref:`api_guide_Variable_en` ): The loss variable of the network.
-        parameter_list(list[Variable|str], optional): List of Parameters or Parameter.names
+        loss(Tensor): The loss Tensor of the network.
+        parameter_list(list[Tensor|str], optional): List of Parameters or Parameter.names
                                            that need to be updated by optimizers.
                                            If it is None, all parameters
                                            will be updated.
                                            Default: None.
-        no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
-                               should be ignored. All variables with
+        no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
+                               should be ignored. All Tensors with
                                `stop_gradient=True` from all blocks will
                                be automatically added into this set.
-                               If this parameter is not None, the Variables or Variable.names in this set will be added to the default set.
+                               If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
                                Default: None.
         callbacks(list[callable object], optional): List of callback functions.
                                                     The callbacks are used for
@@ -1312,70 +1312,73 @@ def append_backward(loss,
                                                     new gradient operator is added
                                                     into the program. The callable
                                                     object must have two input
-                                                    parameters: 'block' and 'context'.
-                                                    The 'block' is the :ref:`api_guide_Block_en` which
+                                                    parameters: ``block`` and ``context`` .
+                                                    The ``block`` is the :ref:`api_guide_Block_en` which
                                                     the new gradient operator will
-                                                    be added to. The 'context' is a
+                                                    be added to. The ``context`` is a
                                                     map, whose keys are gradient
-                                                    variable names and values are
-                                                    corresponding original :ref:`api_guide_Variable_en` .
-                                                    In addition to this, the 'context'
+                                                    Tensor names and values are
+                                                    corresponding original :ref:`api_guide_tensor_en` .
+                                                    In addition to this, the ``context``
                                                     has another special key-value pair:
-                                                    the key is string '__current_op_desc__'
+                                                    the key is string ``__current_op_desc__``
                                                     and the value is the op_desc of the
                                                     gradient operator who has just
                                                     triggered the callable object.
                                                     Default: None.

     Returns:
-        list of tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ): Pairs of parameter and its corresponding gradients.
-        The key is the parameter and the value is gradient variable.
+        list of tuple ( :ref:`api_guide_tensor_en` , :ref:`api_guide_tensor_en` ): Pairs of parameter and its corresponding gradients.
+        The key is the parameter and the value is gradient Tensor.

     Raises:
-        AssertionError: If `loss` is not an instance of Variable.
+        AssertionError: If ``loss`` is not an instance of Tensor.

     Examples:
         .. code-block:: python

-            import paddle.fluid as fluid
-
-            x = fluid.data(name='x', shape=[None, 13], dtype='int64')
-            y = fluid.data(name='y', shape=[None, 1], dtype='float32')
-            x_emb = fluid.embedding(x, size=[100, 256])
-            y_predict = fluid.layers.fc(input=x_emb, size=1, act=None, name='my_fc')
-            loss = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_loss = fluid.layers.mean(loss)
+            import paddle
+            import paddle.nn.functional as F
+
+            paddle.enable_static()
+
+            x = paddle.static.data(name='x', shape=[None, 13], dtype='int64')
+            y = paddle.static.data(name='y', shape=[None, 1], dtype='float32')
+            x_emb = paddle.static.nn.embedding(x, size=[100, 256])
+            y_predict = paddle.static.nn.fc(input=x_emb, size=1, act=None, name='my_fc')
+            loss = F.square_error_cost(input=y_predict, label=y)
+            avg_loss = paddle.mean(loss)

             # Get all weights in main_program, not include bias.
-            all_weights = [param for param in fluid.default_main_program().block(0).all_parameters() if 'w_' in param.name]
+            all_weights = [param for param in paddle.static.default_main_program().block(0).all_parameters() if 'w_' in param.name]
             all_weights_name = [w.name for w in all_weights]

             # return all param_grads needed to be updated if parameter_list set default None.
-            p_g_list1 = fluid.backward.append_backward(loss=avg_loss)
+            p_g_list1 = paddle.static.append_backward(loss=avg_loss)
             # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]

-            # return the param_grads corresponding to parameter_list that can be list of param (Variable).
-            p_g_list2 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights)
+            # return the param_grads corresponding to parameter_list that can be list of param (Tensor).
+            p_g_list2 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights)
             # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]

             # parameter_list can be list of param.name (str).
-            p_g_list3 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights_name)
+            p_g_list3 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights_name)
             # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]

-            # no_grad_set can be set of Variables that means grad will be cut off from these Variables.
-            p_g_list4 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
+            # no_grad_set can be set of Tensors that means grad will be cut off from these Tensors.
+            p_g_list4 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set([x_emb]))
             # output: [(my_fc.w_0, my_fc.w_0@GRAD), (my_fc.b_0, my_fc.b_0@GRAD)]

-            # no_grad_set can be set of Variable.name when the Variable is created inside layers and can't be specified explicitly.
-            p_g_list5 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
+            # no_grad_set can be set of Tensor.name when the Tensor is created inside layers and can't be specified explicitly.
+            p_g_list5 = paddle.static.append_backward(loss=avg_loss, no_grad_set=set(['my_fc.b_0']))
             # output: [(embedding_0.w_0, embedding_0.w_0@GRAD), (my_fc.w_0, my_fc.w_0@GRAD)]

             # return [] because all param_grads are filtered by no_grad_set.
-            p_g_list6 = fluid.backward.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))
+            p_g_list6 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights))

     """
     check_type(loss, 'loss', framework.Variable,
-               'fluid.backward.append_backward')
+               'paddle.static.append_backward')

     if loss.op is None:
         # the loss is from a cloned program. Find loss op manually.
@@ -1387,7 +1390,7 @@ def append_backward(loss,

     if callbacks is not None:
         check_type(callbacks, 'callbacks', list,
-                   'fluid.backward.append_backward')
+                   'paddle.static.append_backward')

     program = loss.block.program
     root_block = program.block(0)
@@ -1727,21 +1730,21 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
     Backpropagate the gradients of targets to inputs.

     Args:
-        targets(Variable|list[Variable]): The target variables
-        inputs(Variable|list[Variable]): The input variables
-        target_gradients (Variable|list[Variable], optional): The gradient variables
+        targets(Tensor|list[Tensor]): The target Tensors
+        inputs(Tensor|list[Tensor]): The input Tensors
+        target_gradients (Tensor|list[Tensor], optional): The gradient Tensors
             of targets which has the same shape with targets, If None, ones will
             be created for them.
-        no_grad_set(set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
-                               should be ignored. All variables with
+        no_grad_set(set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
+                               should be ignored. All Tensors with
                                `stop_gradient=True` from all blocks will
                                be automatically added into this set.
-                               If this parameter is not None, the Variables or Variable.names in this set will be added to the default set.
+                               If this parameter is not None, the Tensors or Tensor.names in this set will be added to the default set.
                                Default: None.

     Return:
-        (list[Variable]): A list of gradients for inputs
-        If an input does not affect targets, the corresponding gradient variable
+        (list[Tensor]): A list of gradients for inputs
+        If an input does not affect targets, the corresponding gradient Tensor
         will be None
     """
     targets = _as_list(targets)
@@ -1865,41 +1868,42 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None):
     Backpropagate the gradients of targets to inputs.

     Args:
-        targets (Variable|list[Variable]): The target variables.
-        inputs (Variable|list[Variable]): The input variables.
-        target_gradients (Variable|list[Variable], optional): The gradient variables
+        targets (Tensor|list[Tensor]): The target Tensors.
+        inputs (Tensor|list[Tensor]): The input Tensors.
+        target_gradients (Tensor|list[Tensor], optional): The gradient Tensor
             of targets which has the same shape with targets, If None, ones will
             be created for them.
-        no_grad_set (set[Variable|str], optional): Set of Variables or Variable.names in the :ref:`api_guide_Block_en` 0 whose gradients
-            should be ignored. All variables with `stop_gradient=True` from all blocks will
-            be automatically added into this set. If this parameter is not None, the Variables or Variable.names
+        no_grad_set (set[Tensor|str], optional): Set of Tensors or Tensor.names in the :ref:`api_guide_Block_en` 0 whose gradients
+            should be ignored. All Tensors with ``stop_gradient=True`` from all blocks will
+            be automatically added into this set. If this parameter is not None, the Tensors or Tensor.names
             in this set will be added to the default set. Default: None.

     Return:
-        (list[Variable]): A list of gradients for inputs
-        If an input does not affect targets, the corresponding gradient variable
+        (list[Tensor]): A list of gradients for inputs
+        If an input does not affect targets, the corresponding gradient Tensor
         will be None.

     Examples:
         .. code-block:: python

-            import paddle.fluid as fluid
+            import paddle
+            import paddle.nn.functional as F
+
+            paddle.enable_static()

-            x = fluid.data(name='x', shape=[None,2,8,8], dtype='float32')
+            x = paddle.static.data(name='x', shape=[None, 2, 8, 8], dtype='float32')
             x.stop_gradient=False
-            y = fluid.layers.conv2d(x, 4, 1, bias_attr=False)
-            y = fluid.layers.relu(y)
-            y = fluid.layers.conv2d(y, 4, 1, bias_attr=False)
-            y = fluid.layers.relu(y)
-            z = fluid.gradients([y], x)
-            print(z)
+            y = paddle.static.nn.conv2d(x, 4, 1, bias_attr=False)
+            y = F.relu(y)
+            z = paddle.static.gradients([y], x)
+            print(z) # [var x@GRAD : fluid.VarType.LOD_TENSOR.shape(-1L, 2L, 8L, 8L).astype(VarType.FP32)]
     """
     check_type(targets, 'targets', (framework.Variable, list),
-               'fluid.backward.gradients')
+               'paddle.static.gradients')
     check_type(inputs, 'inputs', (framework.Variable, list),
-               'fluid.backward.gradients')
+               'paddle.static.gradients')
     check_type(target_gradients, 'target_gradients', (
-        framework.Variable, list, type(None)), 'fluid.backward.gradients')
+        framework.Variable, list, type(None)), 'paddle.static.gradients')

     outs = calc_gradient(targets, inputs, target_gradients, no_grad_set)
     return _as_list(outs)
@@ -506,11 +506,12 @@ def name_scope(prefix=None):
     """
     :api_attr: Static Graph

-    Generate hierarchical name prefix for the operators.
+    Generate hierarchical name prefix for the operators in Static Graph.

     Note:
         This should only used for debugging and visualization purpose.
         Don't use it for serious analysis such as graph/program transformations.
+        Don't use it in dygraph, since it will cause memory leak.

     Args:
         prefix(str, optional): prefix. Default is none.
@@ -518,21 +519,22 @@ def name_scope(prefix=None):
     Examples:
         .. code-block:: python

-          import paddle.fluid as fluid
-          with fluid.name_scope("s1"):
-             a = fluid.data(name='data', shape=[None, 1], dtype='int32')
+          import paddle
+          paddle.enable_static()
+          with paddle.static.name_scope("s1"):
+             a = paddle.data(name='data', shape=[None, 1], dtype='int32')
              b = a + 1
-             with fluid.name_scope("s2"):
+             with paddle.static.name_scope("s2"):
                 c = b * 1
-             with fluid.name_scope("s3"):
+             with paddle.static.name_scope("s3"):
                 d = c / 1
-          with fluid.name_scope("s1"):
-              f = fluid.layers.pow(d, 2.0)
-          with fluid.name_scope("s4"):
+          with paddle.static.name_scope("s1"):
+              f = paddle.tensor.pow(d, 2.0)
+          with paddle.static.name_scope("s4"):
               g = f - 1

           # Op are created in the default main program.
-          for op in fluid.default_main_program().block(0).ops:
+          for op in paddle.static.default_main_program().block(0).ops:
              # elementwise_add is created in /s1/
              if op.type == 'elementwise_add':
                  assert op.desc.attr("op_namescope") == '/s1/'
@@ -5396,13 +5398,13 @@ def program_guard(main_program, startup_program=None):
     """
     :api_attr: Static Graph

-    Change the global main program and startup program with `"with"` statement.
-    Layer functions in the Python `"with"` block will append operators and
-    variables to the new main programs.
+    Change the global main program and startup program with ``with`` statement.
+    Layer functions in the Python ``with`` block will append operators and
+    Tensors to the new main programs.

     Args:
-        main_program(Program): New main program inside `"with"` statement.
-        startup_program(Program, optional): New startup program inside `"with"`
+        main_program(Program): New main program inside ``with`` statement.
+        startup_program(Program, optional): New startup program inside ``with``
             statement. :code:`None` means not changing startup program,
             default_startup_program is still used.
             Default: None.
@@ -5410,13 +5412,14 @@ def program_guard(main_program, startup_program=None):
     Examples:
        .. code-block:: python

-          import paddle.fluid as fluid
-
-          main_program = fluid.Program()
-          startup_program = fluid.Program()
-          with fluid.program_guard(main_program, startup_program):
-              data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32')
-              hidden = fluid.layers.fc(input=data, size=10, act='relu')
+          import paddle
+
+          paddle.enable_static()
+          main_program = paddle.static.Program()
+          startup_program = paddle.static.Program()
+          with paddle.static.program_guard(main_program, startup_program):
+              data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')
+              hidden = paddle.static.nn.fc(input=data, size=10, act='relu')

     Notes: The temporary :code:`Program` can be used if the user does not need
     to construct either of startup program or main program.
@@ -5424,20 +5427,22 @@ def program_guard(main_program, startup_program=None):
     Examples:
        .. code-block:: python

-          import paddle.fluid as fluid
-
-          main_program = fluid.Program()
-          # does not care about startup program. Just pass a temporary value.
-          with fluid.program_guard(main_program, fluid.Program()):
-              data = fluid.data(name='image', shape=[None, 784, 784], dtype='float32')
+          import paddle
+
+          paddle.enable_static()
+          main_program = paddle.static.Program()
+          # does not care about startup program. Just pass a temporary value.
+          with paddle.static.program_guard(main_program, paddle.static.Program()):
+              data = paddle.static.data(name='image', shape=[None, 784, 784], dtype='float32')

     """
     from .data_feeder import check_type
-    check_type(main_program, 'main_program', Program, 'fluid.program_guard')
+    check_type(main_program, 'main_program', Program,
+               'paddle.static.program_guard')
     main_program = switch_main_program(main_program)
     if startup_program is not None:
         check_type(startup_program, 'startup_program', Program,
-                   'fluid.program_guard')
+                   'paddle.static.program_guard')
         startup_program = switch_startup_program(startup_program)
     try:
         yield
...
@@ -9287,8 +9287,8 @@ def pad2d(input,
     than height-1. And the width dimension has the same condition.

     Parameters:
-        input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format, which is a 4-D Tensor with data type float32.
-        paddings (Variable | List[int32]): The padding size. If padding is a List, it must
+        input (Tensor): The input image with [N, C, H, W] format or [N, H, W, C] format, which is a 4-D Tensor with data type float32.
+        paddings (Tensor | List[int32]): The padding size. If padding is a List, it must
             contain four integers, (padding_top, padding_bottom, padding_left, padding_right).
             Otherwise, it is a 1-D Tensor with shape [4]. Data type is int32.
             Default is [0, 0, 0, 0].
@@ -9304,10 +9304,7 @@ def pad2d(input,
         name (str, optional) : The default value is None. Normally there is no need for
                     user to set this property. For more information, please refer to :ref:`api_guide_Name` .

-    Returns: a 4-D Tensor padded according to paddings and mode and data type is same as input.
-
-    Return Type: Variable
+    Returns: Tensor, a 4-D Tensor padded according to paddings and mode and data type is same as input.

     Examples:
         .. code-block:: text
@@ -9340,9 +9337,33 @@ def pad2d(input,
     Code Examples:
         .. code-block:: python

-          import paddle.fluid as fluid
-          data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32')
-          result = fluid.layers.pad2d(input=data, paddings=[0, 1, 2, 3], mode='reflect')
+            import numpy as np
+            import paddle
+            import paddle.nn.functional as F
+
+            # example 1
+            x_shape = (1, 1, 3, 4)
+            x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1
+            tensor_x = paddle.to_tensor(x)
+            y = F.pad2d(tensor_x, paddings=[1, 2, 2, 1], pad_value=1, mode='constant')
+            print(y.numpy())
+            # [[[[ 1.  1.  1.  1.  1.  1.  1.]
+            #    [ 1.  1.  1.  2.  3.  4.  1.]
+            #    [ 1.  1.  5.  6.  7.  8.  1.]
+            #    [ 1.  1.  9. 10. 11. 12.  1.]
+            #    [ 1.  1.  1.  1.  1.  1.  1.]
+            #    [ 1.  1.  1.  1.  1.  1.  1.]]]]
+
+            # example 2
+            x_shape = (1, 1, 2, 3)
+            x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1
+            tensor_x = paddle.to_tensor(x)
+            y = F.pad2d(tensor_x, paddings=[1, 1, 1, 1], mode='reflect')
+            print(y.numpy())
+            # [[[[5. 4. 5. 6. 5.]
+            #    [2. 1. 2. 3. 2.]
+            #    [5. 4. 5. 6. 5.]
+            #    [2. 1. 2. 3. 2.]]]]

     """
     check_variable_and_dtype(
         input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'],
...
@@ -394,7 +394,8 @@ foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4)
-py_test_modules(test_warpctc_op MODULES test_warpctc_op)
+# disable test_warpctc_op
+# py_test_modules(test_warpctc_op MODULES test_warpctc_op)
 py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS})
 py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS})
 py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
...
@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)

-        self.assertTrue(optimizer.user_defined_strategy.a_sync)
-        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
+        self.assertTrue(fleet._final_strategy().a_sync)
+        a_sync_configs = fleet._final_strategy().a_sync_configs
         self.assertTrue(a_sync_configs['k_steps'] == 0)
...
@@ -72,8 +72,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
         optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
         optimizer.minimize(avg_cost)

-        self.assertTrue(optimizer.user_defined_strategy.a_sync)
-        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
+        self.assertTrue(fleet._final_strategy().a_sync)
+        a_sync_configs = fleet._final_strategy().a_sync_configs
         self.assertTrue(a_sync_configs['k_steps'] == 0)
...
...@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): ...@@ -60,8 +60,8 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync) self.assertTrue(fleet._final_strategy().a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs a_sync_configs = fleet._final_strategy().a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 800) self.assertTrue(a_sync_configs['k_steps'] == 800)
......
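The assertions above read the a_sync settings back from the strategy that fleet ends up using. As a rough illustration of where those values come from, here is a minimal sketch assuming only the public DistributedStrategy interface; the strategy is built locally, with no distributed launch or role maker involved.

.. code-block:: python

    import paddle.distributed.fleet as fleet

    # configure asynchronous training with a k_steps threshold
    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True
    strategy.a_sync_configs = {"k_steps": 800}

    print(strategy.a_sync)                     # True
    print(strategy.a_sync_configs["k_steps"])  # 800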
...@@ -18,6 +18,8 @@ import unittest ...@@ -18,6 +18,8 @@ import unittest
import paddle import paddle
import os import os
paddle.enable_static()
class TestFleetAMPOptimizer(unittest.TestCase): class TestFleetAMPOptimizer(unittest.TestCase):
def setUp(self): def setUp(self):
...@@ -55,6 +57,8 @@ class TestFleetAMPOptimizer(unittest.TestCase): ...@@ -55,6 +57,8 @@ class TestFleetAMPOptimizer(unittest.TestCase):
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
strategy = fleet._final_strategy()
ops = [op.type for op in avg_cost.block.ops] ops = [op.type for op in avg_cost.block.ops]
self.assertIn('cast', ops) self.assertIn('cast', ops)
self.assertIn('check_finite_and_unscale', ops) self.assertIn('check_finite_and_unscale', ops)
......
...@@ -18,6 +18,8 @@ import os ...@@ -18,6 +18,8 @@ import os
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
class TestDistributedStrategyAuto(unittest.TestCase): class TestDistributedStrategyAuto(unittest.TestCase):
def setUp(self): def setUp(self):
......
...@@ -167,6 +167,8 @@ class TestFleetDygraph(unittest.TestCase): ...@@ -167,6 +167,8 @@ class TestFleetDygraph(unittest.TestCase):
state_dict = adam.state_dict() state_dict = adam.state_dict()
adam.set_state_dict(state_dict) adam.set_state_dict(state_dict)
final_strategy = fleet._final_strategy()
class TestFleetBaseSingleRunCollective(unittest.TestCase): class TestFleetBaseSingleRunCollective(unittest.TestCase):
def setUp(self): def setUp(self):
......
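The dygraph test above round-trips the optimizer state through state_dict/set_state_dict before asking fleet for the final strategy. A minimal, fleet-free sketch of that round trip, assuming a plain paddle.optimizer.Adam in dynamic graph mode; the Linear layer and its sizes are placeholders, not taken from the test.

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    adam = paddle.optimizer.Adam(
        learning_rate=0.001, parameters=linear.parameters())

    # serialize the optimizer state and load it straight back
    state_dict = adam.state_dict()
    adam.set_state_dict(state_dict)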
...@@ -19,6 +19,8 @@ import os ...@@ -19,6 +19,8 @@ import os
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
class TestFleetLambMetaOptimizer(unittest.TestCase): class TestFleetLambMetaOptimizer(unittest.TestCase):
def setUp(self): def setUp(self):
......
...@@ -19,6 +19,8 @@ import os ...@@ -19,6 +19,8 @@ import os
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker
paddle.enable_static()
class TestFleetLarsMetaOptimizer(unittest.TestCase): class TestFleetLarsMetaOptimizer(unittest.TestCase):
def setUp(self): def setUp(self):
......
...@@ -20,6 +20,7 @@ import paddle ...@@ -20,6 +20,7 @@ import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.static import program_guard, Program
from op_test import OpTest from op_test import OpTest
...@@ -37,7 +38,7 @@ class TestMVOp(OpTest): ...@@ -37,7 +38,7 @@ class TestMVOp(OpTest):
self.check_grad(['X', 'Vec'], 'Out') self.check_grad(['X', 'Vec'], 'Out')
def init_config(self): def init_config(self):
self.x = np.random.random((5, 100)).astype("float64") self.x = np.random.random((2, 100)).astype("float64")
self.vec = np.random.random((100)).astype("float64") self.vec = np.random.random((100)).astype("float64")
...@@ -57,21 +58,36 @@ class TestMVAPI(unittest.TestCase): ...@@ -57,21 +58,36 @@ class TestMVAPI(unittest.TestCase):
paddle.enable_static() paddle.enable_static()
def test_static_graph(self): def test_static_graph(self):
paddle.enable_static() for x_stop_gradient in [False, True]:
for vec_stop_gradient in [False, True]:
paddle.enable_static()
train_program = Program()
startup_program = Program()
self.input_x = np.random.rand(5, 100).astype("float64")
self.input_vec = np.random.rand(100).astype("float64")
with program_guard(train_program, startup_program):
data_x = paddle.static.data(
"x", shape=[5, 100], dtype="float64")
data_vec = paddle.static.data(
"vec", shape=[100], dtype="float64")
data_x.stop_gradient = x_stop_gradient
data_vec.stop_gradient = vec_stop_gradient
result_vec = paddle.mv(data_x, data_vec)
self.input_x = np.random.rand(5, 100).astype("float64") self.place = paddle.CPUPlace()
self.input_vec = np.random.rand(100).astype("float64") exe = paddle.static.Executor(self.place)
res, = exe.run(
data_x = paddle.static.data("x", shape=[5, 100], dtype="float64") feed={"x": self.input_x,
data_vec = paddle.static.data("vec", shape=[100], dtype="float64") "vec": self.input_vec},
result_vec = paddle.mv(data_x, data_vec) fetch_list=[result_vec])
self.place = paddle.CPUPlace() z_expected = np.array(np.dot(self.input_x, self.input_vec))
exe = paddle.static.Executor(self.place) self.assertTrue(np.allclose(res, z_expected))
res, = exe.run(feed={"x": self.input_x,
"vec": self.input_vec},
fetch_list=[result_vec])
z_expected = np.array(np.dot(self.input_x, self.input_vec))
self.assertTrue(np.allclose(res, z_expected))
class TestMVError(unittest.TestCase): class TestMVError(unittest.TestCase):
......
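The static-graph test above compares paddle.mv against np.dot through an executor. The same check can be written much more compactly in dynamic graph mode; a minimal sketch, assuming the default dygraph mode and the same random shapes the test uses.

.. code-block:: python

    import numpy as np
    import paddle

    x_np = np.random.rand(5, 100).astype("float64")
    vec_np = np.random.rand(100).astype("float64")

    # matrix-vector product: (5, 100) x (100,) -> (5,)
    out = paddle.mv(paddle.to_tensor(x_np), paddle.to_tensor(vec_np))
    assert np.allclose(out.numpy(), np.dot(x_np, vec_np))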
...@@ -30,7 +30,6 @@ from ...fluid.layers import nn, utils ...@@ -30,7 +30,6 @@ from ...fluid.layers import nn, utils
from ...fluid.data_feeder import check_variable_and_dtype from ...fluid.data_feeder import check_variable_and_dtype
from ...fluid.param_attr import ParamAttr from ...fluid.param_attr import ParamAttr
from ...fluid.layer_helper import LayerHelper from ...fluid.layer_helper import LayerHelper
from .common import pad2d
def _is_list_or_tuple(input): def _is_list_or_tuple(input):
......
...@@ -14,24 +14,20 @@ ...@@ -14,24 +14,20 @@
__all__ = [ __all__ = [
'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam', 'Adadelta', 'AdadeltaOptimizer', 'Adagrad', 'AdagradOptimizer', 'Adam',
'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Adamax', 'AdamW', 'DecayedAdagrad', 'DecayedAdagradOptimizer', 'Dpsgd',
'DGCMomentumOptimizer', 'Dpsgd', 'DpsgdOptimizer', 'DpsgdOptimizer', 'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer',
'ExponentialMovingAverage', 'Ftrl', 'FtrlOptimizer', 'LambOptimizer', 'LookaheadOptimizer', 'ModelAverage', 'Momentum', 'MomentumOptimizer',
'LarsMomentum', 'LarsMomentumOptimizer', 'LookaheadOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', '_LRScheduler', 'NoamLR',
'ModelAverage', 'Momentum', 'MomentumOptimizer', 'PipelineOptimizer', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'PolynomialLR',
'RecomputeOptimizer', 'RMSProp', 'SGD', 'SGDOptimizer', 'Optimizer', 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR', 'LambdaLR',
'_LRScheduler', 'NoamLR', 'PiecewiseLR', 'NaturalExpLR', 'InverseTimeLR', 'ReduceLROnPlateau', 'CosineAnnealingLR'
'PolynomialLR', 'LinearLrWarmup', 'ExponentialLR', 'MultiStepLR', 'StepLR',
'LambdaLR', 'ReduceLROnPlateau', 'CosineAnnealingLR'
] ]
from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\ from ..fluid.optimizer import Momentum, Adagrad, Dpsgd, DecayedAdagrad, Ftrl,\
AdagradOptimizer,DpsgdOptimizer,\ AdagradOptimizer, DpsgdOptimizer, DecayedAdagradOptimizer, \
DecayedAdagradOptimizer,FtrlOptimizer,AdadeltaOptimizer, \ FtrlOptimizer, AdadeltaOptimizer, ModelAverage, \
ModelAverage, LarsMomentum, DGCMomentumOptimizer, LambOptimizer,\ ExponentialMovingAverage, LookaheadOptimizer
ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, \
RecomputeOptimizer, LarsMomentumOptimizer
from .optimizer import Optimizer from .optimizer import Optimizer
from .adam import Adam from .adam import Adam
......
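After the __all__ cleanup above, the trimmed export list is what users reach through paddle.optimizer. A minimal usage sketch of one remaining export, Adam, assuming dynamic graph mode; the layer and batch sizes are placeholders chosen for illustration.

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 1)
    adam = paddle.optimizer.Adam(
        learning_rate=0.001, parameters=linear.parameters())

    # one training step on random data
    x = paddle.rand([4, 10])
    loss = paddle.mean(linear(x))
    loss.backward()
    adam.step()
    adam.clear_grad()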