diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index 544c014eaf98a99b1737809f2cbad39b46fdb276..0b22bab26789a3e2ebd20428adc236faa8b38dee 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -19,13 +19,17 @@ namespace paddle {
 namespace framework {
 extern size_t SizeOfType(proto::VarType::Type type);
 void Tensor::check_memory_size() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE_NOT_NULL(holder_, platform::errors::PreconditionNotMet(
+                                       "Tensor holds no memory. "
+                                       "Call Tensor::mutable_data firstly."));
   PADDLE_ENFORCE_LE(
       numel() * SizeOfType(type()), memory_size(),
-      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-      "first to re-allocate memory.\n"
-      "or maybe the required data-type mismatches the data already stored.");
+      platform::errors::PreconditionNotMet(
+          "Tensor's dimension is out of bound."
+          "Tensor's dimension must be equal or less than the size of its "
+          "memory."
+          "But received  Tensor's dimension is d%, memory's size is %d.",
+          numel() * SizeOfType(type()), memory_size()));
 }
 
 Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
@@ -37,15 +41,21 @@ size_t Tensor::memory_size() const {
 void* Tensor::mutable_data(const platform::Place& place,
                            proto::VarType::Type type, size_t requested_size) {
   type_ = type;
-  PADDLE_ENFORCE_GE(numel(), 0,
-                    "When calling this method, the Tensor's numel must be "
-                    "equal or larger than zero. "
-                    "Please check Tensor::dims, or Tensor::Resize has been "
-                    "called first. The Tensor's shape is [",
-                    dims(), "] now");
+  PADDLE_ENFORCE_GE(
+      numel(), 0,
+      platform::errors::PreconditionNotMet(
+          "The Tensor's element number must be equal or greater than zero. "
+          "The Tensor's shape is [",
+          dims(), "] now"));
   size_t size = numel() * SizeOfType(type);
   if (requested_size) {
-    PADDLE_ENFORCE_GE(requested_size, size);
+    PADDLE_ENFORCE_GE(
+        requested_size, size,
+        platform::errors::InvalidArgument(
+            "The requested memory size is less than the memory size of Tensor. "
+            "But received requested memory size is d%, "
+            "memory size of Tensor is %d.",
+            requested_size, size));
     size = requested_size;
   }
   /* some versions of boost::variant don't have operator!= */
@@ -62,8 +72,8 @@ void* Tensor::mutable_data(const platform::Place& place,
 
 void* Tensor::mutable_data(const platform::Place& place,
                            size_t requested_size) {
-  PADDLE_ENFORCE_NOT_NULL(
-      this->holder_, "Cannot invoke mutable data if current hold nothing.");
+  PADDLE_ENFORCE_NOT_NULL(this->holder_, platform::errors::PreconditionNotMet(
+                                             "The tensor is not initialized."));
   return mutable_data(place, type_, requested_size);
 }
 
@@ -75,12 +85,20 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) {
 
 Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const {
   check_memory_size();
-  PADDLE_ENFORCE_GE(begin_idx, 0,
-                    "The start row index must be greater than 0.");
-  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound.");
+  PADDLE_ENFORCE_GE(
+      begin_idx, 0,
+      platform::errors::OutOfRange("The start row index must be greater than 0."
+                                   "But received the start index is d%.",
+                                   begin_idx));
+  PADDLE_ENFORCE_LE(
+      end_idx, dims_[0],
+      platform::errors::OutOfRange("The end row index is out of bound."));
   PADDLE_ENFORCE_LT(
       begin_idx, end_idx,
-      "The start row index must be lesser than the end row index.");
+      platform::errors::InvalidArgument(
+          "The start row index must be less than the end row index."
+          "But received the start index = %d, the end index = %d.",
+          begin_idx, end_idx));
 
   if (dims_[0] == 1) {
     return *this;
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index d9fddc4c77d99efcb119ae37591b26bff0e51c0a..f2ccff2c133a238d02e25c65faf41dd519fdb506 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -131,13 +131,17 @@ class Tensor {
 
   const platform::Place& place() const {
     PADDLE_ENFORCE_NOT_NULL(
-        holder_, "Tensor not initialized yet when Tensor::place() is called.");
+        holder_,
+        platform::errors::PreconditionNotMet(
+            "Tensor not initialized yet when Tensor::place() is called."));
     return holder_->place();
   }
 
   proto::VarType::Type type() const {
     PADDLE_ENFORCE_NOT_NULL(
-        holder_, "Tensor not initialized yet when Tensor::type() is called.");
+        holder_,
+        platform::errors::PreconditionNotMet(
+            "Tensor not initialized yet when Tensor::type() is called."));
     return type_;
   }
 
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index f5171b0a8d1efc33521c2d731e88f0f96bdf41c7..986551b935e8811a6b257c2b4a613a493b3b644b 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -43,9 +43,13 @@ inline T* Tensor::data() {
   check_memory_size();
   bool valid =
       std::is_same<T, void>::value || type_ == DataTypeTrait<T>::DataType();
-  PADDLE_ENFORCE(
-      valid, "Tensor holds the wrong type, it holds %s, but desires to be %s",
-      DataTypeToString(type_), DataTypeToString(DataTypeTrait<T>::DataType()));
+  PADDLE_ENFORCE_EQ(
+      valid, true,
+      platform::errors::InvalidArgument(
+          "Tensor holds the wrong type, it holds %s, but desires to be %s",
+          DataTypeToString(type_),
+          DataTypeToString(DataTypeTrait<T>::DataType())));
+
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
 }
@@ -69,9 +73,12 @@ inline T* Tensor::mutable_data(const platform::Place& place,
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   int rank = src.dims().size();
   PADDLE_ENFORCE_GE(
-      rank, 2,
-      "'ReshapeToMatrix()' is only used for flatten high rank "
-      "tensors to matrixs. Can not be used in reshaping vectors.");
+      rank, 2, platform::errors::InvalidArgument(
+                   "'ReshapeToMatrix()' is only used for flatten high rank "
+                   "tensors to matrixs. The dimensions of Tensor must be "
+                   "greater or equal than 2. "
+                   "But received dimensions of Tensor is %d",
+                   rank));
   if (rank == 2) {
     return src;
   }
diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index 84f98d339a29b7d23de6e8be4b069b216be31ab2..cc972dd93d032c19015c86debebc27f7c8c0d155 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -41,7 +41,7 @@ TEST(Tensor, DataAssert) {
     std::string ex_msg = err.what();
     EXPECT_TRUE(ex_msg.find("holder_ should not be null") != std::string::npos);
     EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call "
-                            "Tensor::mutable_data first.") !=
+                            "Tensor::mutable_data firstly.") !=
                 std::string::npos);
   }
   ASSERT_TRUE(caught);
@@ -157,7 +157,7 @@ TEST(Tensor, ShareDataWith) {
       EXPECT_TRUE(ex_msg.find("holder_ should not be null") !=
                   std::string::npos);
       EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call "
-                              "Tensor::mutable_data first.") !=
+                              "Tensor::mutable_data firstly.") !=
                   std::string::npos);
     }
     ASSERT_TRUE(caught);
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 6dd13d32e6e25f1657f351ff3a54562435b098f3..fd4b1a54d2b44c5bb99d3aa2e0c776c806fb5c8d 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -363,9 +363,12 @@ if(WITH_MKLDNN)
   inference_analysis_api_test_build(${QUANT_IMG_CLASS_TEST_APP} ${QUANT_IMG_CLASS_TEST_APP_SRC})
 
   # MobileNetV1 FP32 vs. Quant INT8
+  # The FP32 model should already be downloaded for slim Quant unit tests on Linux
   set(QUANT2_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2")
   set(QUANT2_INT8_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8")
-  download_quant_data(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz")
+  if(NOT LINUX)
+      download_quant_data(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz")
+  endif(NOT LINUX)
   download_quant_data(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz")
   inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH})
 
diff --git a/paddle/fluid/operators/correlation_op.cc b/paddle/fluid/operators/correlation_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2e6ff214bfa30af2e25b7feac41d22f02ab75a7
--- /dev/null
+++ b/paddle/fluid/operators/correlation_op.cc
@@ -0,0 +1,181 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+inline std::vector<int64_t> CorrelationOutputSize(int batch, int input_height,
+                                                  int input_width, int stride1,
+                                                  int stride2, int kernel_size,
+                                                  int pad_size,
+                                                  int max_displacement) {
+  std::vector<int64_t> output_shape({batch});
+  int kernel_radius = (kernel_size - 1) / 2;
+  int border_radius = kernel_radius + max_displacement;
+  int padded_input_height = input_height + 2 * pad_size;
+  int padded_input_width = input_width + 2 * pad_size;
+  int output_channel = ((max_displacement / stride2) * 2 + 1) *
+                       ((max_displacement / stride2) * 2 + 1);
+  output_shape.push_back(output_channel);
+  int output_height =
+      std::ceil(static_cast<float>(padded_input_height - 2 * border_radius) /
+                static_cast<float>(stride1));
+  int output_width =
+      std::ceil(static_cast<float>(padded_input_width - 2 * border_radius) /
+                static_cast<float>(stride1));
+  output_shape.push_back(output_height);
+  output_shape.push_back(output_width);
+  return output_shape;
+}
+
+class CorrelationOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input1", "Input is a 4-D Tensor with shape [N, C, H, W]");
+    AddInput("Input2", "Input is a 4-D Tensor with shape [N, C, H, W]");
+    AddOutput("Output",
+              "(Tensor) The output tensor of correlation operator. "
+              "It has same data fromat and data type as the Input.");
+    AddAttr<int>("pad_size", "pad size for input1 and input2");
+    AddAttr<int>("kernel_size", "kernel size of input1 and input2");
+    AddAttr<int>("max_displacement", "max displacement of input1 and input2");
+    AddAttr<int>("stride1", "Input1 stride");
+    AddAttr<int>("stride2", "Input2 stride");
+    AddAttr<int>("corr_type_multiply", "correlation coefficient").SetDefault(1);
+    AddComment(
+        R"DOC(Correlation of two feature map. Only support NCHW data format.)DOC");
+  }
+};
+
+class CorrelationOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Input1"), "Input", "X", "CorrelationOp");
+    OP_INOUT_CHECK(ctx->HasInput("Input2"), "Input", "Y", "CorrelationOp");
+    int stride1 = ctx->Attrs().Get<int>("stride1");
+    int stride2 = ctx->Attrs().Get<int>("stride2");
+    int max_displacement = ctx->Attrs().Get<int>("max_displacement");
+    int pad_size = ctx->Attrs().Get<int>("pad_size");
+    int kernel_size = ctx->Attrs().Get<int>("kernel_size");
+
+    auto in_dims = ctx->GetInputDim("Input1");
+    auto in2_dims = ctx->GetInputDim("Input2");
+
+    PADDLE_ENFORCE_EQ(in_dims.size() == 4, true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) of CorrelationOp must be 4 dims."
+                          "But received dims is %d.",
+                          in_dims.size()));
+
+    PADDLE_ENFORCE_EQ(in2_dims.size() == 4, true,
+                      platform::errors::InvalidArgument(
+                          "Input(Y) of CorrelationOp must be 4 dims."
+                          "But received dims is %d.",
+                          in2_dims.size()));
+    std::vector<int64_t> output_shape =
+        CorrelationOutputSize(in_dims[0], in_dims[2], in_dims[3], stride1,
+                              stride2, kernel_size, pad_size, max_displacement);
+    ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        OperatorWithKernel::IndicateVarDataType(ctx, "Input1");
+    PADDLE_ENFORCE_EQ(input_data_type, ctx.Input<Tensor>("Input2")->type(),
+                      platform::errors::InvalidArgument(
+                          "X and Y shoule have the same datatype"));
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+template <typename T>
+class CorrelationOpGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("correlation_grad");
+    op->SetInput("Input1", this->Input("Input1"));
+    op->SetInput("Input2", this->Input("Input2"));
+    op->SetInput(framework::GradVarName("Output"), this->OutputGrad("Output"));
+    op->SetOutput(framework::GradVarName("Input1"), this->InputGrad("Input1"));
+    op->SetOutput(framework::GradVarName("Input2"), this->InputGrad("Input2"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+class CorrelationOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Input1"), "Input", "X", "CorrelationOp");
+    OP_INOUT_CHECK(ctx->HasInput("Input2"), "Input", "Y", "CorrelationOp");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Output")), "Input",
+                   "Output@GRAD", "CorrelationGradOp");
+
+    auto in1_dims = ctx->GetInputDim("Input1");
+    auto in2_dims = ctx->GetInputDim("Input2");
+    ctx->SetOutputDim(framework::GradVarName("Input1"), in1_dims);
+    ctx->SetOutputDim(framework::GradVarName("Input2"), in2_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "Input1"), ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class CorrelationKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::Unimplemented("Correlation only supports GPU now."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(correlation, ops::CorrelationOp, ops::CorrelationOpMaker,
+                  ops::CorrelationOpGradMaker<paddle::framework::OpDesc>,
+                  ops::CorrelationOpGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(correlation_grad, ops::CorrelationOpGrad);
+REGISTER_OP_CPU_KERNEL(correlation, ops::CorrelationKernel<float>,
+                       ops::CorrelationKernel<double>);
diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0d177f653ec3d5cacbd4d938e6e5da6689b1bc74
--- /dev/null
+++ b/paddle/fluid/operators/correlation_op.cu
@@ -0,0 +1,483 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <string>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+#define THREADS_PER_BLOCK 32
+#define FULL_MASK 0xffffffff
+
+using framework::Tensor;
+using DataLayout = framework::DataLayout;
+
+template <typename T>
+__forceinline__ __device__ T warpReduceSum(T val) {
+  for (int offset = 16; offset > 0; offset /= 2) {
+    val += __shfl_down_sync(FULL_MASK, val, offset);
+  }
+  return val;
+}
+
+template <typename T>
+__forceinline__ __device__ T blockReduceSum(T val) {
+  static __shared__ T shared[32];
+  int lane = threadIdx.x % warpSize;
+  int wid = threadIdx.x / warpSize;
+
+  val = warpReduceSum(val);
+  if (lane == 0) shared[wid] = val;
+
+  __syncthreads();
+  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
+
+  if (wid == 0) val = warpReduceSum(val);
+
+  return val;
+}
+
+template <typename T>
+__global__ void set_zero(T *x, int num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x)
+    x[i] = static_cast<T>(0);
+}
+
+template <typename T>
+__global__ void channel_first(const T *input, T *rinput, const int channel,
+                              const int height, const int width,
+                              const int pad_size) {
+  int n = blockIdx.x;
+  int h = blockIdx.y;
+  int w = blockIdx.z;
+
+  int ch_off = threadIdx.x;
+  T value;
+  int dimchw = channel * height * width;
+  int dimhw = height * width;
+
+  int p_dimw = (width + 2 * pad_size);
+  int p_dimh = (height + 2 * pad_size);
+  int p_dimchw = channel * p_dimw * p_dimh;
+  int p_dimcw = channel * p_dimw;
+
+  for (int c = ch_off; c < channel; c += THREADS_PER_BLOCK) {
+    value = input[n * dimchw + c * dimhw + h * width + w];
+    rinput[n * p_dimchw + (h + pad_size) * p_dimcw + (w + pad_size) * channel +
+           c] = value;
+  }
+}
+
+template <typename T>
+__global__ void correlation_forward(
+    T *output, const int output_channel, const int output_height,
+    const int output_width, const T *rinput1, const int input_channel,
+    const int input_height, const int input_width, const T *rinput2,
+    const int pad_size, const int kernel_size, const int max_displacement,
+    const int stride1, const int stride2) {
+  int p_input_width = input_width + 2 * pad_size;
+  int p_input_height = input_height + 2 * pad_size;
+
+  int kernel_rad = (kernel_size - 1) / 2;
+  int displacement_rad = max_displacement / stride2;
+
+  int displacement_size = 2 * displacement_rad + 1;
+
+  int n = blockIdx.x;
+  int h1 = blockIdx.y * stride1 + max_displacement;
+  int w1 = blockIdx.z * stride1 + max_displacement;
+  int c = threadIdx.x;
+
+  int p_dimchw = p_input_height * p_input_width * input_channel;
+  int p_dimcw = p_input_width * input_channel;
+  int p_dimc = input_channel;
+
+  int t_dimchw = output_channel * output_height * output_width;
+  int t_dimhw = output_height * output_width;
+  int t_dimw = output_width;
+
+  int nelems = kernel_size * kernel_size * p_dimc;
+
+  for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {
+    for (int ti = -displacement_rad; ti <= displacement_rad; ++ti) {
+      int w2 = w1 + ti * stride2;
+      int h2 = h1 + tj * stride2;
+
+      T acc0 = 0;
+      for (int j = -kernel_rad; j <= kernel_rad; ++j) {
+        for (int i = -kernel_rad; i <= kernel_rad; ++i) {
+          for (int ch = c; ch < p_dimc; ch += blockDim.x) {
+            int index1 =
+                n * p_dimchw + (h1 + j) * p_dimcw + (w1 + i) * p_dimc + ch;
+            int index2 =
+                n * p_dimchw + (h2 + j) * p_dimcw + (w2 + i) * p_dimc + ch;
+            acc0 += static_cast<T>(rinput1[index1] * rinput2[index2]);
+          }
+        }
+      }
+      if (blockDim.x == warpSize) {
+        __syncwarp();
+        acc0 = warpReduceSum(acc0);
+      } else {
+        __syncthreads();
+        acc0 = blockReduceSum(acc0);
+      }
+
+      if (threadIdx.x == 0) {
+        int tc = (tj + displacement_rad) * displacement_size +
+                 (ti + displacement_rad);
+        const int t_index =
+            n * t_dimchw + tc * t_dimhw + blockIdx.y * t_dimw + blockIdx.z;
+        output[t_index] = static_cast<T>(acc0 / nelems);
+      }
+    }
+  }
+}
+
+// class CorrelationKernel<platform::CUDADeviceContext, T>
+template <typename T>
+class CorrelationCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "Correlation only supports GPU now."));
+
+    auto *input1 = ctx.Input<Tensor>("Input1");
+    auto *input2 = ctx.Input<Tensor>("Input2");
+    int pad_size = ctx.Attr<int>("pad_size");
+    int kernel_size = ctx.Attr<int>("kernel_size");
+    int stride1 = ctx.Attr<int>("stride1");
+    int stride2 = ctx.Attr<int>("stride2");
+    int max_displacement = ctx.Attr<int>("max_displacement");
+    int corr_type_multiply = ctx.Attr<int>("corr_type_multiply");
+
+    auto *output = ctx.Output<Tensor>("Output");
+    output->mutable_data<T>(ctx.GetPlace());
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    // base on input1, NCHW
+    auto in_dims = input1->dims();
+    int N = in_dims[0];
+    int C = in_dims[1];
+    int H = in_dims[2];
+    int W = in_dims[3];
+
+    int padded_input_height = H + 2 * pad_size;
+    int padded_input_width = W + 2 * pad_size;
+
+    Tensor rinput1 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>(
+        {N, padded_input_height, padded_input_width, C}, dev_ctx);
+    rinput1.mutable_data<T>(ctx.GetPlace());
+
+    Tensor rinput2 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>(
+        {N, padded_input_height, padded_input_width, C}, dev_ctx);
+    rinput2.mutable_data<T>(ctx.GetPlace());
+
+    set_zero<<<(rinput1.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>(
+        rinput1.data<T>(), rinput1.numel());
+    set_zero<<<(rinput2.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>(
+        rinput2.data<T>(), rinput2.numel());
+    set_zero<<<(output->numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>(
+        output->data<T>(), output->numel());
+
+    auto out_dims = output->dims();
+    int OC = out_dims[1];
+    int OH = out_dims[2];
+    int OW = out_dims[3];
+
+    dim3 blocks_grid(N, H, W);
+    dim3 threads_block(THREADS_PER_BLOCK);
+
+    channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(
+        input1->data<T>(), rinput1.data<T>(), C, H, W, pad_size);
+    channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(
+        input2->data<T>(), rinput2.data<T>(), C, H, W, pad_size);
+
+    dim3 threadsPerBlock(THREADS_PER_BLOCK);
+    dim3 totalBlocksCorr(N, OH, OW);
+
+    correlation_forward<
+        T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
+        output->data<T>(), OC, OH, OW, rinput1.data<T>(), C, H, W,
+        rinput2.data<T>(), pad_size, kernel_size, max_displacement, stride1,
+        stride2);
+  }
+};
+
+template <typename T>
+__global__ void correlation_backward_input1(
+    int item, T *grad_input1, const int input_channel, const int input_height,
+    const int input_width, const T *grad_output, const int output_channel,
+    const int output_height, const int output_width, const T *rinput2,
+    const int pad_size, const int kernel_size, const int max_displacement,
+    const int stride1, const int stride2) {
+  int n = item;
+  int h = blockIdx.x * stride1 + pad_size;
+  int w = blockIdx.y * stride1 + pad_size;
+  int c = blockIdx.z;
+  int tch_off = threadIdx.x;
+
+  int kernel_rad = (kernel_size - 1) / 2;
+  int displacement_rad = max_displacement / stride2;
+  int displacement_size = 2 * displacement_rad + 1;
+
+  int xmin = (w - kernel_rad - max_displacement) / stride1;
+  int ymin = (h - kernel_rad - max_displacement) / stride1;
+
+  int xmax = (w + kernel_rad - max_displacement) / stride1;
+  int ymax = (h + kernel_rad - max_displacement) / stride1;
+
+  if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) {
+    return;
+  }
+
+  if (xmin > xmax || ymin > ymax) {
+    return;
+  }
+
+  xmin = max(0, xmin);
+  xmax = min(output_width - 1, xmax);
+
+  ymin = max(0, ymin);
+  ymax = min(output_height - 1, ymax);
+
+  int p_input_width = input_width + 2 * pad_size;
+  int p_input_height = input_height + 2 * pad_size;
+  int p_dimchw = input_channel * p_input_height * p_input_width;
+  int p_dimcw = input_channel * p_input_width;
+  int p_dimc = input_channel;
+
+  int t_dimchw = output_channel * output_height * output_width;
+  int t_dimhw = output_height * output_width;
+  int t_dimw = output_width;
+
+  int o_dimchw = input_channel * input_height * input_width;
+  int o_dimhw = input_height * input_width;
+  int o_dimw = input_width;
+
+  int nelems = kernel_size * kernel_size * input_channel;
+
+  __shared__ T prod_sum[THREADS_PER_BLOCK];
+  prod_sum[tch_off] = 0;
+
+  for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) {
+    int i2 = (tc % displacement_size - displacement_rad) * stride2;
+    int j2 = (tc / displacement_size - displacement_rad) * stride2;
+
+    int index2 = n * p_dimchw + (h + j2) * p_dimcw + (w + i2) * p_dimc + c;
+
+    T val2 = rinput2[index2];
+    for (int j = ymin; j <= ymax; ++j) {
+      for (int i = xmin; i <= xmax; ++i) {
+        int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i;
+        prod_sum[tch_off] += grad_output[t_index] * val2;
+      }
+    }
+  }
+
+  __syncthreads();
+
+  if (tch_off == 0) {
+    T reduce_sum = 0;
+    for (int index = 0; index < THREADS_PER_BLOCK; index++) {
+      reduce_sum += prod_sum[index];
+    }
+    const int index1 =
+        n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size);
+    grad_input1[index1] = static_cast<T>(reduce_sum / nelems);
+  }
+}
+
+template <typename T>
+__global__ void correlation_backward_input2(
+    int item, T *grad_input2, const int input_channel, const int input_height,
+    const int input_width, const T *grad_output, const int output_channel,
+    const int output_height, const int output_width, const T *rinput1,
+    const int pad_size, const int kernel_size, const int max_displacement,
+    const int stride1, const int stride2) {
+  int n = item;
+  int h = blockIdx.x * stride1 + pad_size;
+  int w = blockIdx.y * stride1 + pad_size;
+  int c = blockIdx.z;
+
+  int tch_off = threadIdx.x;
+
+  int kernel_rad = (kernel_size - 1) / 2;
+  int displacement_rad = max_displacement / stride2;
+  int displacement_size = 2 * displacement_rad + 1;
+
+  int p_input_width = input_width + 2 * pad_size;
+  int p_input_height = input_height + 2 * pad_size;
+  int p_dimchw = input_channel * p_input_height * p_input_width;
+  int p_dimcw = input_channel * p_input_width;
+  int p_dimc = input_channel;
+
+  int t_dimchw = output_channel * output_height * output_width;
+  int t_dimhw = output_height * output_width;
+  int t_dimw = output_width;
+
+  int o_dimchw = input_channel * input_height * input_width;
+  int o_dimhw = input_height * input_width;
+  int o_dimw = input_width;
+
+  int nelems = kernel_size * kernel_size * input_channel;
+
+  __shared__ T prod_sum[THREADS_PER_BLOCK];
+  prod_sum[tch_off] = 0;
+
+  for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) {
+    int i2 = (tc % displacement_size - displacement_rad) * stride2;
+    int j2 = (tc / displacement_size - displacement_rad) * stride2;
+
+    int xmin = (w - kernel_rad - max_displacement - i2) / stride1;
+    int ymin = (h - kernel_rad - max_displacement - j2) / stride1;
+
+    int xmax = (w + kernel_rad - max_displacement - i2) / stride1;
+    int ymax = (h + kernel_rad - max_displacement - j2) / stride1;
+
+    if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) {
+      continue;
+    }
+
+    if (xmin > xmax || ymin > ymax) {
+      continue;
+    }
+
+    xmin = max(0, xmin);
+    xmax = min(output_width - 1, xmax);
+
+    ymin = max(0, ymin);
+    ymax = min(output_height - 1, ymax);
+
+    int index1 = n * p_dimchw + (h - j2) * p_dimcw + (w - i2) * p_dimc + c;
+    T val1 = rinput1[index1];
+    for (int j = ymin; j <= ymax; ++j) {
+      for (int i = xmin; i <= xmax; ++i) {
+        int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i;
+        prod_sum[tch_off] += grad_output[t_index] * val1;
+      }
+    }
+  }
+
+  __syncthreads();
+
+  if (tch_off == 0) {
+    T reduce_sum = 0;
+    for (int index = 0; index < THREADS_PER_BLOCK; index++) {
+      reduce_sum += prod_sum[index];
+    }
+    const int index2 =
+        n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size);
+    grad_input2[index2] = static_cast<T>(reduce_sum / nelems);
+  }
+}
+
+template <typename T>
+class CorrelationCUDAGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "Correlation only supports GPU now."));
+    const auto *input1 = ctx.Input<Tensor>("Input1");
+    const auto *input2 = ctx.Input<Tensor>("Input2");
+    const auto *grad_output =
+        ctx.Input<Tensor>(framework::GradVarName("Output"));
+    const int pad_size = ctx.Attr<int>("pad_size");
+    const int kernel_size = ctx.Attr<int>("kernel_size");
+    const int stride1 = ctx.Attr<int>("stride1");
+    const int stride2 = ctx.Attr<int>("stride2");
+    const int max_displacement = ctx.Attr<int>("max_displacement");
+    const int corr_type_multiply = ctx.Attr<int>("corr_type_multiply");
+
+    auto *grad_input1 = ctx.Output<Tensor>(framework::GradVarName("Input1"));
+    grad_input1->mutable_data<T>(ctx.GetPlace());
+    auto *grad_input2 = ctx.Output<Tensor>(framework::GradVarName("Input2"));
+    grad_input2->mutable_data<T>(ctx.GetPlace());
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    auto in_dims = input1->dims();
+    int N = in_dims[0];
+    int C = in_dims[1];
+    int H = in_dims[2];
+    int W = in_dims[3];
+
+    int padded_input_height = H + 2 * pad_size;
+    int padded_input_width = W + 2 * pad_size;
+
+    Tensor rinput1 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>(
+        {N, padded_input_height, padded_input_width, C}, dev_ctx);
+    rinput1.mutable_data<T>(ctx.GetPlace());
+
+    Tensor rinput2 = ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>(
+        {N, padded_input_height, padded_input_width, C}, dev_ctx);
+    rinput2.mutable_data<T>(ctx.GetPlace());
+
+    set_zero<<<(rinput1.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>(
+        rinput1.data<T>(), rinput1.numel());
+    set_zero<<<(rinput2.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>(
+        rinput2.data<T>(), rinput2.numel());
+    set_zero<<<(grad_input1->numel() + 512 - 1) / 512, 512, 0,
+               dev_ctx.stream()>>>(grad_input1->data<T>(),
+                                   grad_input1->numel());
+    set_zero<<<(grad_input2->numel() + 512 - 1) / 512, 512, 0,
+               dev_ctx.stream()>>>(grad_input2->data<T>(),
+                                   grad_input2->numel());
+
+    auto grad_out_dims = grad_output->dims();
+    int GOC = grad_out_dims[1];
+    int GOH = grad_out_dims[2];
+    int GOW = grad_out_dims[3];
+
+    dim3 blocks_grid(N, H, W);
+    dim3 threads_block(THREADS_PER_BLOCK);
+
+    channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(
+        input1->data<T>(), rinput1.data<T>(), C, H, W, pad_size);
+    channel_first<T><<<blocks_grid, threads_block, 0, dev_ctx.stream()>>>(
+        input2->data<T>(), rinput2.data<T>(), C, H, W, pad_size);
+
+    dim3 threadsPerBlock(THREADS_PER_BLOCK);
+    dim3 totalBlocksCorr(H, W, C);
+
+    for (int n = 0; n < N; n++) {
+      correlation_backward_input1<
+          T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
+          n, grad_input1->data<T>(), C, H, W, grad_output->data<T>(), GOC, GOH,
+          GOW, rinput2.data<T>(), pad_size, kernel_size, max_displacement,
+          stride1, stride2);
+    }
+
+    for (int n = 0; n < N; n++) {
+      correlation_backward_input2<
+          T><<<totalBlocksCorr, threadsPerBlock, 0, dev_ctx.stream()>>>(
+          n, grad_input2->data<T>(), C, H, W, grad_output->data<T>(), GOC, GOH,
+          GOW, rinput1.data<T>(), pad_size, kernel_size, max_displacement,
+          stride1, stride2);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(correlation, ops::CorrelationCUDAKernel<float>,
+                        ops::CorrelationCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(correlation_grad, ops::CorrelationCUDAGradKernel<float>,
+                        ops::CorrelationCUDAGradKernel<double>);
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
index b3e3332fe3425301a3dede3a3d810697ad4debf3..44f602237da2e2c8fa26e39326f977d10235155d 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License.*/
 
 #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -54,11 +55,14 @@ class CollectFpnProposalsOp : public framework::OperatorWithKernel {
               score_dim[1]));
     }
     context->SetOutputDim("FpnRois", {post_nms_topN, 4});
+    if (context->HasOutput("RoisNum")) {
+      context->SetOutputDim("RoisNum", {-1});
+    }
     if (!context->IsRuntime()) {  // Runtime LoD infershape will be computed
       // in Kernel.
       context->ShareLoD("MultiLevelRois", "FpnRois");
     }
-    if (context->IsRuntime()) {
+    if (context->IsRuntime() && !context->HasInputs("MultiLevelRoIsNum")) {
       std::vector<framework::InferShapeVarPtr> roi_inputs =
           context->GetInputVarPtrs("MultiLevelRois");
       std::vector<framework::InferShapeVarPtr> score_inputs =
@@ -99,7 +103,16 @@ class CollectFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor) Multiple score LoDTensors from each level in shape"
              " (N, 1), N is the number of RoIs.")
         .AsDuplicable();
+    AddInput(
+        "MultiLevelRoIsNum",
+        "(List of Tensor) The RoIs' number of each image on multiple levels."
+        "The number on each level has the shape of (N), N is the number of "
+        "images.")
+        .AsDuplicable()
+        .AsDispensable();
     AddOutput("FpnRois", "(LoDTensor) All selected RoIs with highest scores");
+    AddOutput("RoisNum", "(Tensor), Number of RoIs in each images.")
+        .AsDispensable();
     AddAttr<int>("post_nms_topN",
                  "Select post_nms_topN RoIs from"
                  " all images and all fpn layers");
@@ -123,3 +136,14 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL(collect_fpn_proposals,
                        ops::CollectFpnProposalsOpKernel<float>,
                        ops::CollectFpnProposalsOpKernel<double>);
+REGISTER_OP_VERSION(collect_fpn_proposals)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade collect_fpn_proposals add a new input 
+              [MultiLevelRoIsNum] and add a new output [RoisNum].)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .NewInput("MultiLevelRoIsNum",
+                      "The RoIs' number of each image on multiple levels."
+                      "The number on each level has the shape of (N), "
+                      "N is the number of images.")
+            .NewOutput("RoisNum", "The number of RoIs in each image."));
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index 35222a85cd388f6fef3c61c440be7b36598d9e01..86207052bb2bef4f7bea34c2614fe7686f579de8 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -80,14 +80,27 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     int lod_size;
     auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
 
+    auto multi_rois_num = ctx.MultiInput<Tensor>("MultiLevelRoIsNum");
     for (size_t i = 0; i < roi_ins.size(); ++i) {
       auto roi_in = roi_ins[i];
       auto score_in = score_ins[i];
-      auto roi_lod = roi_in->lod().back();
-      lod_size = roi_lod.size() - 1;
-      for (size_t n = 0; n < lod_size; ++n) {
-        for (size_t j = roi_lod[n]; j < roi_lod[n + 1]; ++j) {
-          roi_batch_id_data[index++] = n;
+      if (multi_rois_num.size() > 0) {
+        framework::Tensor temp;
+        TensorCopySync(*multi_rois_num[i], platform::CPUPlace(), &temp);
+        const int* length_in = temp.data<int>();
+        lod_size = multi_rois_num[i]->numel();
+        for (size_t n = 0; n < lod_size; ++n) {
+          for (size_t j = 0; j < length_in[n]; ++j) {
+            roi_batch_id_data[index++] = n;
+          }
+        }
+      } else {
+        auto length_in = roi_in->lod().back();
+        lod_size = length_in.size() - 1;
+        for (size_t n = 0; n < lod_size; ++n) {
+          for (size_t j = length_in[n]; j < length_in[n + 1]; ++j) {
+            roi_batch_id_data[index++] = n;
+          }
         }
       }
 
@@ -190,6 +203,13 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel<T> {
       offset.emplace_back(offset.back() + length_lod_cpu[i]);
     }
 
+    if (ctx.HasOutput("RoisNum")) {
+      auto* rois_num = ctx.Output<Tensor>("RoisNum");
+      int* rois_num_data = rois_num->mutable_data<int>({lod_size}, place);
+      memory::Copy(place, rois_num_data, place, length_lod_data,
+                   lod_size * sizeof(int), dev_ctx.stream());
+    }
+
     framework::LoD lod;
     lod.emplace_back(offset);
     fpn_rois->set_lod(lod);
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
index badd88f0689ba9defcb3f26eb57fef89308aa877..950b8b78933bff6bf1692df61142258dfbc87a8c 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
@@ -17,6 +17,7 @@ limitations under the License.*/
 #include <algorithm>
 #include <cmath>
 #include <cstring>
+#include <numeric>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
@@ -65,6 +66,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
 
     auto multi_layer_scores =
         context.MultiInput<paddle::framework::LoDTensor>("MultiLevelScores");
+    auto multi_rois_num = context.MultiInput<Tensor>("MultiLevelRoIsNum");
+    int num_size = multi_rois_num.size();
 
     auto* fpn_rois = context.Output<paddle::framework::LoDTensor>("FpnRois");
 
@@ -88,11 +91,21 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     const int num_fpn_level = multi_layer_rois.size();
     std::vector<int> integral_of_all_rois(num_fpn_level + 1, 0);
     for (int i = 0; i < num_fpn_level; ++i) {
-      auto cur_rois_lod = multi_layer_rois[i]->lod().back();
-      integral_of_all_rois[i + 1] =
-          integral_of_all_rois[i] + cur_rois_lod[cur_rois_lod.size() - 1];
+      int all_rois = 0;
+      if (num_size == 0) {
+        auto cur_rois_lod = multi_layer_rois[i]->lod().back();
+        all_rois = cur_rois_lod[cur_rois_lod.size() - 1];
+      } else {
+        const int* cur_rois_num = multi_rois_num[i]->data<int>();
+        all_rois = std::accumulate(
+            cur_rois_num, cur_rois_num + multi_rois_num[i]->numel(), 0);
+      }
+      integral_of_all_rois[i + 1] = integral_of_all_rois[i] + all_rois;
     }
 
+    const int batch_size = (num_size == 0)
+                               ? multi_layer_rois[0]->lod().back().size() - 1
+                               : multi_rois_num[0]->numel();
     // concatenate all fpn rois scores into a list
     // create a vector to store all scores
     std::vector<ScoreWithID<T>> scores_of_all_rois(
@@ -100,11 +113,20 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     for (int i = 0; i < num_fpn_level; ++i) {
       const T* cur_level_scores = multi_layer_scores[i]->data<T>();
       int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i];
-      auto cur_scores_lod = multi_layer_scores[i]->lod().back();
       int cur_batch_id = 0;
+      int pre_num = 0;
       for (int j = 0; j < cur_level_num; ++j) {
-        if (static_cast<size_t>(j) >= cur_scores_lod[cur_batch_id + 1]) {
-          cur_batch_id++;
+        if (num_size == 0) {
+          auto cur_scores_lod = multi_layer_scores[i]->lod().back();
+          if (static_cast<size_t>(j) >= cur_scores_lod[cur_batch_id + 1]) {
+            cur_batch_id++;
+          }
+        } else {
+          const int* rois_num_data = multi_rois_num[i]->data<int>();
+          if (j >= pre_num + rois_num_data[cur_batch_id]) {
+            pre_num += rois_num_data[cur_batch_id];
+            cur_batch_id++;
+          }
         }
         int cur_index = j + integral_of_all_rois[i];
         scores_of_all_rois[cur_index].score = cur_level_scores[j];
@@ -134,6 +156,9 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
     T* fpn_rois_data = fpn_rois->data<T>();
     std::vector<size_t> lod0(1, 0);
     int cur_batch_id = 0;
+    std::vector<int64_t> num_per_batch;
+    int pre_idx = 0;
+    int cur_num = 0;
     for (int i = 0; i < post_nms_topN; ++i) {
       int cur_fpn_level = scores_of_all_rois[i].level;
       int cur_level_index = scores_of_all_rois[i].index;
@@ -144,6 +169,18 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel<T> {
       if (scores_of_all_rois[i].batch_id != cur_batch_id) {
         cur_batch_id = scores_of_all_rois[i].batch_id;
         lod0.emplace_back(i);
+        cur_num = i - pre_idx;
+        pre_idx = i;
+        num_per_batch.emplace_back(cur_num);
+      }
+    }
+    num_per_batch.emplace_back(post_nms_topN - pre_idx);
+    if (context.HasOutput("RoisNum")) {
+      auto* rois_num = context.Output<Tensor>("RoisNum");
+      int* rois_num_data =
+          rois_num->mutable_data<int>({batch_size}, context.GetPlace());
+      for (int i = 0; i < batch_size; i++) {
+        rois_num_data[i] = num_per_batch[i];
       }
     }
     lod0.emplace_back(post_nms_topN);
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
index 160d43a917b3c74ff905e070714415d35c5c877c..614b37e703e721337057e04c5611386ff87a1e9e 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -48,6 +49,14 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputsDim("MultiFpnRois", outs_dims);
     ctx->SetOutputDim("RestoreIndex", {-1, 1});
+
+    if (ctx->HasOutputs("MultiLevelRoIsNum")) {
+      std::vector<framework::DDim> outs_num_dims;
+      for (size_t i = 0; i < num_out_rois; ++i) {
+        outs_num_dims.push_back({-1});
+      }
+      ctx->SetOutputsDim("MultiLevelRoIsNum", outs_num_dims);
+    }
     if (!ctx->IsRuntime()) {
       for (size_t i = 0; i < num_out_rois; ++i) {
         ctx->SetLoDLevel("MultiFpnRois", ctx->GetLoDLevel("FpnRois"), i);
@@ -66,12 +75,22 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
 class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
-    AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)");
+    AddInput("FpnRois", "(LoDTensor) The RoIs at all levels in shape (-1, 4)");
+    AddInput("RoisNum",
+             "(Tensor) The number of RoIs in shape (B),"
+             "B is the number of images")
+        .AsDispensable();
     AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator")
         .AsDuplicable();
     AddOutput("RestoreIndex",
               "(Tensor) An array of positive number which is "
               "used to restore the order of FpnRois");
+    AddOutput("MultiLevelRoIsNum",
+              "(List of Tensor) The RoIs' number of each image on multiple "
+              "levels. The number on each level has the shape of (B),"
+              "B is the number of images.")
+        .AsDuplicable()
+        .AsDispensable();
     AddAttr<int>("min_level",
                  "The lowest level of FPN layer where the"
                  " proposals come from");
@@ -105,3 +124,14 @@ REGISTER_OPERATOR(
 REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals,
                        ops::DistributeFpnProposalsOpKernel<float>,
                        ops::DistributeFpnProposalsOpKernel<double>);
+REGISTER_OP_VERSION(distribute_fpn_proposals)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade distribute_fpn_proposals add a new input
+              [RoisNum] and add a new output [MultiLevelRoIsNum].)ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .NewInput("RoIsNum", "The number of RoIs in each image.")
+            .NewOutput("MultiLevelRoisNum",
+                       "The RoIs' number of each image on multiple "
+                       "levels. The number on each level has the shape of (B),"
+                       "B is the number of images."));
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index 1e3cd9f36c595f978f5b5e5f5c5cf5cad6dc9059..27c06a0f8fb207b5dc85c7875ea91428b16e606c 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -76,12 +76,20 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     int num_level = max_level - min_level + 1;
 
     // check that the fpn_rois is not empty
-    PADDLE_ENFORCE_EQ(
-        fpn_rois->lod().size(), 1UL,
-        platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD"
-                                          "with one level"));
+    if (!ctx.HasInput("RoisNum")) {
+      PADDLE_ENFORCE_EQ(
+          fpn_rois->lod().size(), 1UL,
+          platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD"
+                                            "with one level"));
+    }
 
-    auto fpn_rois_lod = fpn_rois->lod().back();
+    std::vector<size_t> fpn_rois_lod;
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num = ctx.Input<Tensor>("RoisNum");
+      fpn_rois_lod = GetLodFromRoisNum(rois_num);
+    } else {
+      fpn_rois_lod = fpn_rois->lod().back();
+    }
     int lod_size = fpn_rois_lod.size() - 1;
     int roi_num = fpn_rois_lod[lod_size];
 
@@ -154,6 +162,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
         restore_idx_data, roi_num);
 
     int start = 0;
+    auto multi_rois_num = ctx.MultiOutput<Tensor>("MultiLevelRoIsNum");
+
     for (int i = 0; i < num_level; ++i) {
       Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
       int* sub_lod_data = sub_lod.data<int>();
@@ -180,6 +190,11 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
         multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
                                            dev_ctx.GetPlace());
       }
+      if (multi_rois_num.size() > 0) {
+        Tensor* rois_num_t = multi_rois_num[i];
+        TensorCopySync(sub_lod, dev_ctx.GetPlace(), rois_num_t);
+        rois_num_t->Resize({lod_size});
+      }
       framework::LoD lod;
       lod.emplace_back(offset);
       multi_fpn_rois[i]->set_lod(lod);
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
index 0c84b385ccbc1dd26453bd957661c0310b7137e3..79498f01536d2fb2616921a2ef1ffa04f13fae64 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
@@ -28,6 +28,21 @@ namespace operators {
 
 const int kBoxDim = 4;
 
+inline std::vector<size_t> GetLodFromRoisNum(const Tensor* rois_num) {
+  std::vector<size_t> rois_lod;
+  auto* rois_num_data = rois_num->data<int>();
+  Tensor cpu_tensor;
+  if (platform::is_gpu_place(rois_num->place())) {
+    TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor);
+    rois_num_data = cpu_tensor.data<int>();
+  }
+  rois_lod.push_back(static_cast<size_t>(0));
+  for (int i = 0; i < rois_num->numel(); ++i) {
+    rois_lod.push_back(rois_lod.back() + static_cast<size_t>(rois_num_data[i]));
+  }
+  return rois_lod;
+}
+
 template <typename T>
 static inline T BBoxArea(const T* box, bool normalized) {
   if (box[2] < box[0] || box[3] < box[1]) {
@@ -65,13 +80,22 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     const int num_level = max_level - min_level + 1;
 
     // check that the fpn_rois is not empty
-    PADDLE_ENFORCE_EQ(
-        fpn_rois->lod().size(), 1UL,
-        platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD "
-                                          "with one level."));
+    if (!context.HasInput("RoisNum")) {
+      PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
+                        platform::errors::InvalidArgument(
+                            "DistributeFpnProposalsOp needs LoD "
+                            "with one level."));
+    }
 
-    auto fpn_rois_lod = fpn_rois->lod().back();
-    int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
+    std::vector<size_t> fpn_rois_lod;
+    int fpn_rois_num;
+    if (context.HasInput("RoisNum")) {
+      auto* rois_num = context.Input<Tensor>("RoisNum");
+      fpn_rois_lod = GetLodFromRoisNum(rois_num);
+    } else {
+      fpn_rois_lod = fpn_rois->lod().back();
+    }
+    fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
     std::vector<int> target_level;
     // std::vector<int> target_level(fpn_rois_num, -1);
     // record the number of rois in each level
@@ -136,6 +160,18 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
     for (int i = 0; i < fpn_rois_num; ++i) {
       restore_index_data[restore_index_inter[i]] = i;
     }
+    auto multi_rois_num = context.MultiOutput<Tensor>("MultiLevelRoIsNum");
+    if (multi_rois_num.size() > 0) {
+      int batch_size = fpn_rois_lod.size() - 1;
+      for (int i = 0; i < num_level; ++i) {
+        int* rois_num_data = multi_rois_num[i]->mutable_data<int>(
+            {batch_size}, context.GetPlace());
+        for (int j = 0; j < batch_size; ++j) {
+          rois_num_data[j] = static_cast<int>(multi_fpn_rois_lod0[i][j + 1] -
+                                              multi_fpn_rois_lod0[i][j]);
+        }
+      }
+    }
     // merge lod information into LoDTensor
     for (int i = 0; i < num_level; ++i) {
       framework::LoD lod;
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index 981a368e8564fbcd3d688bc67d2def8664bcfe8d..06e560f86d4e0a74f7ae04b155829618ce634697 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
@@ -61,6 +62,10 @@ class GenerateProposalsOp : public framework::OperatorWithKernel {
 
     ctx->SetOutputDim("RpnRois", {-1, 4});
     ctx->SetOutputDim("RpnRoiProbs", {-1, 1});
+    if (!ctx->IsRuntime()) {
+      ctx->SetLoDLevel("RpnRois", std::max(ctx->GetLoDLevel("Scores"), 1));
+      ctx->SetLoDLevel("RpnRoiProbs", std::max(ctx->GetLoDLevel("Scores"), 1));
+    }
   }
 
  protected:
@@ -347,7 +352,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
     lod0.push_back(0);
     anchors.Resize({anchors.numel() / 4, 4});
     variances.Resize({variances.numel() / 4, 4});
-    std::vector<int64_t> tmp_lod;
+    std::vector<int> tmp_num;
 
     int64_t num_proposals = 0;
     for (int64_t i = 0; i < num; ++i) {
@@ -369,16 +374,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
       AppendProposals(rpn_roi_probs, num_proposals, scores);
       num_proposals += proposals.dims()[0];
       lod0.push_back(num_proposals);
-      tmp_lod.push_back(num_proposals);
+      tmp_num.push_back(proposals.dims()[0]);
     }
-    if (context.HasOutput("RpnRoisLod")) {
-      auto *rpn_rois_lod = context.Output<Tensor>("RpnRoisLod");
-      rpn_rois_lod->mutable_data<int64_t>({num}, context.GetPlace());
-      int64_t *lod_data = rpn_rois_lod->data<int64_t>();
+    if (context.HasOutput("RpnRoisNum")) {
+      auto *rpn_rois_num = context.Output<Tensor>("RpnRoisNum");
+      rpn_rois_num->mutable_data<int>({num}, context.GetPlace());
+      int *num_data = rpn_rois_num->data<int>();
       for (int i = 0; i < num; i++) {
-        lod_data[i] = tmp_lod[i];
+        num_data[i] = tmp_num[i];
       }
-      rpn_rois_lod->Resize({num});
+      rpn_rois_num->Resize({num});
     }
     rpn_rois->set_lod(lod);
     rpn_roi_probs->set_lod(lod);
@@ -433,6 +438,16 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
 
     Tensor keep;
     FilterBoxes<T>(ctx, &proposals, min_size, im_info_slice, &keep);
+    // Handle the case when there is no keep index left
+    if (keep.numel() == 0) {
+      math::SetConstant<platform::CPUDeviceContext, T> set_zero;
+      bbox_sel.mutable_data<T>({1, 4}, ctx.GetPlace());
+      set_zero(ctx, &bbox_sel, static_cast<T>(0));
+      Tensor scores_filter;
+      scores_filter.mutable_data<T>({1, 1}, ctx.GetPlace());
+      set_zero(ctx, &scores_filter, static_cast<T>(0));
+      return std::make_pair(bbox_sel, scores_filter);
+    }
 
     Tensor scores_filter;
     bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
@@ -481,7 +496,8 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
               "(LoDTensor), Output proposals with shape (rois_num, 4).");
     AddOutput("RpnRoiProbs",
               "(LoDTensor) Scores of proposals with shape (rois_num, 1).");
-    AddOutput("RpnRoisLod", "(Tensor), rpn rois's lod info").AsDispensable();
+    AddOutput("RpnRoisNum", "(Tensor), The number of Rpn RoIs in each image")
+        .AsDispensable();
     AddAttr<int>("pre_nms_topN",
                  "Number of top scoring RPN proposals to keep before "
                  "applying NMS.");
@@ -515,3 +531,11 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel<float>,
                        ops::GenerateProposalsKernel<double>);
+REGISTER_OP_VERSION(generate_proposals)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade generate_proposals add a new output [RpnRoisNum])ROC",
+        paddle::framework::compatible::OpVersionDesc().NewOutput(
+            "RpnRoisNum",
+            "The number of Rpn RoIs in each image. RpnRoisNum is "
+            "dispensable."));
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu
index fa7670f6d680a95da1c1abd5befe1651ccb7265f..485136d8e2f7ab66f6b1c58deb09036ea5d4e1ec 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -330,6 +330,15 @@ static std::pair<Tensor, Tensor> ProposalForOneImage(
   keep_index.Resize({keep_num});
 
   Tensor scores_filter, proposals_filter;
+  // Handle the case when there is no keep index left
+  if (keep_num == 0) {
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    proposals_filter.mutable_data<T>({1, 4}, ctx.GetPlace());
+    scores_filter.mutable_data<T>({1, 1}, ctx.GetPlace());
+    set_zero(ctx, &proposals_filter, static_cast<T>(0));
+    set_zero(ctx, &scores_filter, static_cast<T>(0));
+    return std::make_pair(proposals_filter, scores_filter);
+  }
   proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
   scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
   GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
@@ -421,7 +430,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
 
     int64_t num_proposals = 0;
     std::vector<size_t> offset(1, 0);
-    std::vector<int64_t> tmp_lod;
+    std::vector<int> tmp_num;
 
     for (int64_t i = 0; i < num; ++i) {
       Tensor im_info_slice = im_info->Slice(i, i + 1);
@@ -448,15 +457,15 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel<T> {
       dev_ctx.Wait();
       num_proposals += proposals.dims()[0];
       offset.emplace_back(num_proposals);
-      tmp_lod.push_back(num_proposals);
+      tmp_num.push_back(proposals.dims()[0]);
     }
-    if (context.HasOutput("RpnRoisLod")) {
-      auto *rpn_rois_lod = context.Output<Tensor>("RpnRoisLod");
-      rpn_rois_lod->mutable_data<int64_t>({num}, context.GetPlace());
-      int64_t *lod_data = rpn_rois_lod->data<int64_t>();
-      memory::Copy(place, lod_data, cpu_place, &tmp_lod[0],
-                   sizeof(int64_t) * num, dev_ctx.stream());
-      rpn_rois_lod->Resize({num});
+    if (context.HasOutput("RpnRoisNum")) {
+      auto *rpn_rois_num = context.Output<Tensor>("RpnRoisNum");
+      rpn_rois_num->mutable_data<int>({num}, context.GetPlace());
+      int *num_data = rpn_rois_num->data<int>();
+      memory::Copy(place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num,
+                   dev_ctx.stream());
+      rpn_rois_num->Resize({num});
     }
     framework::LoD lod;
     lod.emplace_back(offset);
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
index deb71b807128e5c0b173b517e60832894ced41e5..f5224239eb2ded9a156aadc9185eca89f4e3396f 100644
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -115,7 +115,7 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::string>(
         "padding_mode",
         "(bool, default true) The padding method used when source"
-        "index is out of input images. It can be 'zeros', 'reflect' and "
+        "index is out of input images. It can be 'zeros', 'reflection' and "
         "'border'.")
         .SetDefault("zeros");
 
@@ -174,6 +174,10 @@ class GridSampleOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
+                   framework::GradVarName("X"), "grid_sampler");
+    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Grid")), "Output",
+                   framework::GradVarName("Grid"), "grid_sampler");
     auto input_dims = ctx->GetInputDim("X");
     auto grid_dims = ctx->GetInputDim("Grid");
     if (ctx->HasOutput(framework::GradVarName("X"))) {
diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu
index 999f990448ca6370dadacbdaee5bf3bcadcaca0e..4e61d0c2ea7f91e4199c3e9daa3e93ac45bc0eb8 100644
--- a/paddle/fluid/operators/grid_sampler_op.cu
+++ b/paddle/fluid/operators/grid_sampler_op.cu
@@ -268,7 +268,7 @@ class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
     Mode mode;
     if (padding_mode_s == "border") {
       padding_mode = PaddingMode::border;
-    } else if (padding_mode_s == "reflect") {
+    } else if (padding_mode_s == "reflection") {
       padding_mode = PaddingMode::reflect;
     } else {
       padding_mode = PaddingMode::zeros;
@@ -432,7 +432,7 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
     Mode mode;
     if (padding_mode_s == "border") {
       padding_mode = PaddingMode::border;
-    } else if (padding_mode_s == "reflect") {
+    } else if (padding_mode_s == "reflection") {
       padding_mode = PaddingMode::reflect;
     } else {
       padding_mode = PaddingMode::zeros;
diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h
index eda800e78faf5da2bb379b8101e4823c5bc2d2f8..b8faef759ae90e14d1e83b66130bfe957b51907b 100644
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ b/paddle/fluid/operators/grid_sampler_op.h
@@ -76,7 +76,7 @@ static inline void clip(const platform::CPUDeviceContext& ctx,
   if (padding_mode == "border") {
     grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast<T>(0))
                                      .cwiseMin(static_cast<T>(max_val));
-  } else if (padding_mode == "reflect") {
+  } else if (padding_mode == "reflection") {
     if (align_corners) {
       auto double_range = static_cast<T>(max_val * 2);
       auto grid_abs = grid_slice_t.abs();
@@ -117,7 +117,7 @@ static inline void clipWithMask(const platform::CPUDeviceContext& ctx,
     auto in_bound = (res == grid_slice_t);
     grid_scale_t.device(place) = grid_scale_t * in_bound.template cast<T>();
     grid_slice_t.device(place) = res;
-  } else if (padding_mode == "reflect") {
+  } else if (padding_mode == "reflection") {
     if (align_corners) {
       auto double_range = static_cast<T>(max_val * 2);
       auto is_neg = (grid_slice_t < static_cast<T>(0));
diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h
index 369fdb4872b4184d706a5264b58f70f63051fca1..857ecda303c2607b1b6fb9a5d2ec132b335d6c29 100644
--- a/paddle/fluid/operators/kldiv_loss_op.h
+++ b/paddle/fluid/operators/kldiv_loss_op.h
@@ -72,7 +72,11 @@ class KLDivLossKernel : public framework::OpKernel<T> {
       loss_t.device(place) = output;
     } else if ("batchmean" == reduction) {
       auto output_sum = output.sum();
-      loss_t.device(place) = output_sum / output_sum.constant(n);
+      if (n > 0) {
+        loss_t.device(place) = output_sum / output_sum.constant(n);
+      } else {
+        loss_t.device(place) = output_sum;
+      }
     } else if ("mean" == reduction) {
       loss_t.device(place) = output.mean();
     } else if ("sum" == reduction) {
diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc
index e4ffeedb5a0061dd60ca3a30aa9928ef8b05887c..300a3692012ab9631d7049d2042e91fb99ad3c21 100644
--- a/paddle/fluid/operators/math/sequence2batch.cc
+++ b/paddle/fluid/operators/math/sequence2batch.cc
@@ -29,11 +29,24 @@ class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
     auto src_dims = src.dims();
     auto dst_dims = dst->dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
-                      "The src must be matrix with rank 2.");
+                      platform::errors::InvalidArgument(
+                          "The source tensor must be a matrix with rank 2, but "
+                          "got the source tensor rank is %lu. "
+                          "Please check the rank of the source tensor",
+                          src_dims.size()));
     PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
-                      "The dst must be matrix with rank 2.");
-    PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
-                      "The width of src and dst must be same.");
+                      platform::errors::InvalidArgument(
+                          "The destination tensor must be a matrix with rank, "
+                          "but got the destination tensor rank is %lu. "
+                          "Please check the rank of the destination tensor",
+                          dst_dims.size()));
+    PADDLE_ENFORCE_EQ(
+        src_dims[1], dst_dims[1],
+        platform::errors::InvalidArgument(
+            "The width of the source tensor and the destination tensor must be "
+            "same. But got %lu != %lu.Please check the rank of the source "
+            "tensor",
+            src_dims.size(), dst_dims.size()));
     auto height = dst_dims[0];
     auto width = dst_dims[1];
     auto* src_data = src.data<T>();
diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu
index 9ab13659c1cc5b59d28395bcebcfb43fac5b4544..cd1ca572689bc701da801384e5ed08fe6dc10749 100644
--- a/paddle/fluid/operators/math/sequence2batch.cu
+++ b/paddle/fluid/operators/math/sequence2batch.cu
@@ -46,11 +46,24 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
     auto src_dims = src.dims();
     auto dst_dims = dst->dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2,
-                      "The src must be matrix with rank 2.");
+                      platform::errors::InvalidArgument(
+                          "The source tensor must be a matrix with rank 2, but "
+                          "got the source tensor rank is %lu. "
+                          "Please check the rank of the source tensor",
+                          src_dims.size()));
     PADDLE_ENFORCE_EQ(dst_dims.size(), 2,
-                      "The dst must be matrix with rank 2.");
-    PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
-                      "The width of src and dst must be same.");
+                      platform::errors::InvalidArgument(
+                          "The destination tensor must be a matrix with rank, "
+                          "but got the destination tensor rank is %lu. "
+                          "Please check the rank of the destination tensor",
+                          dst_dims.size()));
+    PADDLE_ENFORCE_EQ(
+        src_dims[1], dst_dims[1],
+        platform::errors::InvalidArgument(
+            "The width of the source tensor and the destination tensor must be "
+            "same. But got %lu != %lu.Please check the rank of the source "
+            "tensor",
+            src_dims.size(), dst_dims.size()));
     auto height = dst_dims[0];
     auto width = dst_dims[1];
     auto* src_data = src.data<T>();
diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h
index 9d9f7ef00b8a12088225fd3620cb30b43ef9dce9..6aa513e4d10eef49c02417e98b31cddd57088d7c 100644
--- a/paddle/fluid/operators/math/sequence2batch.h
+++ b/paddle/fluid/operators/math/sequence2batch.h
@@ -64,19 +64,30 @@ class LoDTensor2BatchFunctor {
                   bool is_reverse = false) const {
     if (!is_cal_batch_lod) {
       auto lods = batch->lod();
-      PADDLE_ENFORCE_GT(lods.size(), 2UL,
-                        "The LoD of LoDTensor should inlcude at least 2-level "
-                        "sequence information.");
+      PADDLE_ENFORCE_GT(
+          lods.size(), 2UL,
+          platform::errors::InvalidArgument(
+              "The LoD of LoDTensor should inlcude at least 2-level "
+              "sequence information, but got the LoD level is %lu. Please "
+              "check the input value.",
+              lods.size()));
       PADDLE_ENFORCE_EQ(
           lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0]),
-          "The LoD information should be consistent with the dims.");
+          platform::errors::InvalidArgument(
+              "The LoD information should be consistent with the dims, but got "
+              "%lu != %lu. Please check the input value.",
+              lods[1].size(), static_cast<size_t>(lod_tensor.dims()[0])));
       CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
       to_batch(context, lod_tensor, lods[1], batch, true);
       return;
     }
 
     auto lods = lod_tensor.lod();
-    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(lods.size(), 1UL,
+                      platform::errors::InvalidArgument(
+                          "Only support one level sequence now, but got the "
+                          "LoD level is %lu. Please check the input value.",
+                          lods.size()));
 
     const auto& lod = lods[0];
 
@@ -161,12 +172,19 @@ class Batch2LoDTensorFunctor {
                   const framework::LoDTensor& batch,
                   framework::LoDTensor* lod_tensor) const {
     auto in_lod = batch.lod();
-    PADDLE_ENFORCE_GT(in_lod.size(), 2UL,
-                      "The LoD of LoDTensor should inlcude at least 2-level "
-                      "sequence information.");
+    PADDLE_ENFORCE_GT(
+        in_lod.size(), 2UL,
+        platform::errors::InvalidArgument(
+            "The LoD of LoDTensor should inlcude at least 2-level "
+            "sequence information, but got the LoD level is %lu. Please check "
+            "the input value.",
+            in_lod.size()));
     PADDLE_ENFORCE_EQ(
         in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]),
-        "The LoD information should be consistent with the dims.");
+        platform::errors::InvalidArgument(
+            "The LoD information should be consistent with the dims, but got "
+            "%lu != %lu. Please check the input value.",
+            in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0])));
     CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
     to_seq(context, batch, in_lod[1], lod_tensor, false);
   }
diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc
index 4630689dec160da145e607f662a802444ac98b55..076df0176429c7bbd350698af0137fbcca18f806 100644
--- a/paddle/fluid/operators/math/sequence_padding.cc
+++ b/paddle/fluid/operators/math/sequence_padding.cc
@@ -35,7 +35,11 @@ void CopyValidData(framework::Tensor* dst_tensor,
     int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx];
     PADDLE_ENFORCE_GE(
         pad_seq_len, valid_seq_len,
-        "The padded sequence length can not be less than its original length.");
+        platform::errors::InvalidArgument(
+            "The padded sequence length can not "
+            "be less than its original length. Expected %ld >= %ld, but got "
+            "%ld < %ld. Please check input value.",
+            pad_seq_len, valid_seq_len, pad_seq_len, valid_seq_len));
     int seq_data_offset = seq_offsets[seq_idx] * step_width;
     int pad_data_offset = layout == kBatchLengthWidth
                               ? seq_idx * pad_seq_len * step_width
@@ -95,9 +99,14 @@ class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
 
     CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
               step_width, layout);
-    PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
-                   "The numel of 'pad_value' can only be 1 or be equal to the "
-                   "'step_width'.");
+
+    PADDLE_ENFORCE_EQ(
+        pad_value.numel() == 1 || pad_value.numel() == step_width, true,
+        platform::errors::InvalidArgument(
+            "The numel of 'pad_value' can only be 1 or be equal to the "
+            "'step_width', but got %ld != 1 and %ld. Please check the input "
+            "value.",
+            pad_value.numel(), step_width));
 
     // fill padding value
     T* pad_data = pad_tensor->data<T>();
diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu
index 1b433067900af71bb8a6833cef019d41f9c76858..19c3af03411b8ce95d274532707c7ee3e93f1d55 100644
--- a/paddle/fluid/operators/math/sequence_padding.cu
+++ b/paddle/fluid/operators/math/sequence_padding.cu
@@ -66,17 +66,25 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     if (pad_seq_len == -1) {
       pad_seq_len = max_seq_len;
     }
-    PADDLE_ENFORCE_GE(pad_seq_len, max_seq_len,
-                      "The pad_seq_len must be equal to or greater than the "
-                      "original max sequence length.");
+    PADDLE_ENFORCE_GE(
+        pad_seq_len, max_seq_len,
+        platform::errors::InvalidArgument(
+            "The pad_seq_len must be equal to or greater than the "
+            "original max sequence length. Expected %ld >= %ld, but got %ld < "
+            "%ld. Please check the input value.",
+            pad_seq_len, max_seq_len, pad_seq_len, max_seq_len));
     int step_width = seq_tensor.numel() / seq_tensor_dims[0];
     int seq_num = seq_offsets.size() - 1;
 
     CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len,
               step_width, layout);
-    PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width,
-                   "The numel of 'pad_value' can only be 1 or be equal to the "
-                   "'step_width'.");
+    PADDLE_ENFORCE_EQ(
+        pad_value.numel() == 1 || pad_value.numel() == step_width, true,
+        platform::errors::InvalidArgument(
+            "The numel of 'pad_value' can only be 1 or be equal to "
+            "the 'step_width', but got %ld != 1 and %ld. Please check the "
+            "input value.",
+            pad_value.numel(), step_width));
 
     const int kBlockSize = 512;
 
diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h
index 5580ee5374658c3b7b8e31962cd50f1d72113ba0..956a4ff6a2d45cb619183f9beba1b7e35b7f229c 100644
--- a/paddle/fluid/operators/math/sequence_padding.h
+++ b/paddle/fluid/operators/math/sequence_padding.h
@@ -52,14 +52,25 @@ inline static void CheckDims(const framework::DDim& seq_tensor_dims,
                              const framework::Vector<size_t>& seq_offset,
                              int64_t padded_seq_len, int64_t step_width,
                              const PadLayout& layout) {
-  PADDLE_ENFORCE_EQ(static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back(),
-                    "Value of 1st dimension of the sequence tensor should be "
-                    "equal to sum of lengths of all sequences.");
+  PADDLE_ENFORCE_EQ(
+      static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back(),
+      platform::errors::InvalidArgument(
+          "Value of 1st dimension of the sequence tensor should be "
+          "equal to sum of lengths of all sequences. Expected %ld == %ld, but "
+          "got %ld != %ld. Please check the input value.",
+          static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back(),
+          static_cast<size_t>(seq_tensor_dims[0]), seq_offset.back()));
 
-  PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
-                     seq_tensor_dims.size() == pad_tensor_dims.size(),
-                 "pad_tensor's rank should be 1 greater than seq_tensor's "
-                 "rank, or be equal with it.");
+  PADDLE_ENFORCE_EQ(
+      seq_tensor_dims.size() + 1 == pad_tensor_dims.size() ||
+          seq_tensor_dims.size() == pad_tensor_dims.size(),
+      true, platform::errors::InvalidArgument(
+                "pad_tensor's rank should be 1 greater than seq_tensor's "
+                "rank, or be equal with it. The pad_tensor's rank is %ld, "
+                "expected the seq_tensor's rank is %ld or %ld, but got %ld. "
+                "Please check the input value.",
+                pad_tensor_dims.size(), pad_tensor_dims.size(),
+                pad_tensor_dims.size() - 1, seq_tensor_dims.size()));
 }
 
 /*
diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc
index cc3fbd587668b17b7edde50b157adca83e81eddc..2eee4d0a6c14e8b6134b71294745c71302450347 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -42,15 +42,29 @@ class MaxSeqPoolFunctor {
     auto out_dims = output->dims();
     auto idx_dims = index->dims();
     PADDLE_ENFORCE_GT(in_dims.size(), 1,
-                      "The rank of input shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of input shall be greater than 1, but got "
+                          "the rank is %ld. Please check the input value",
+                          in_dims.size()));
     PADDLE_ENFORCE_GT(out_dims.size(), 1,
-                      "The rank of output shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of output shall be greater than 1, but got "
+                          "the rank is %ld. Please check the input value",
+                          out_dims.size()));
     for (int64_t i = 1; i < in_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i],
-                        "The dimension of input and output shall be same.");
+      PADDLE_ENFORCE_EQ(
+          in_dims[i], out_dims[i],
+          platform::errors::InvalidArgument(
+              "The dimension of input and output shall be same. Expected %ld "
+              "== %ld, but got %ld != %ld. Please check the input value.",
+              in_dims[i], out_dims[i], in_dims[i], out_dims[i]));
     }
-    PADDLE_ENFORCE_EQ(idx_dims, out_dims,
-                      "The dimension of index and output shall be same.");
+    PADDLE_ENFORCE_EQ(
+        idx_dims, out_dims,
+        platform::errors::InvalidArgument(
+            "The dimension of index and output shall be same. Expected %ld == "
+            "%ld, but got %ld != %ld. Please check the input value.",
+            idx_dims, out_dims, idx_dims, out_dims));
 
     auto lod_level = input.lod().size();
     auto starts = input.lod()[lod_level - 1];
@@ -94,12 +108,22 @@ class MaxSeqPoolFunctor<T, true> {
     auto in_dims = input.dims();
     auto out_dims = output->dims();
     PADDLE_ENFORCE_GT(in_dims.size(), 1,
-                      "The rank of input shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of input shall be greater than 1, but got "
+                          "%ld <= 1. Please check the input value.",
+                          in_dims.size()));
     PADDLE_ENFORCE_GT(out_dims.size(), 1,
-                      "The rank of output shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of output shall be greater than 1, but got "
+                          "%ld <= 1. Please check the input value.",
+                          out_dims.size()));
     for (int64_t i = 1; i < in_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i],
-                        "The dimension of input and output shall be same.");
+      PADDLE_ENFORCE_EQ(
+          in_dims[i], out_dims[i],
+          platform::errors::InvalidArgument(
+              "The dimension of input and output shall be same. Expected %ld "
+              "== %ld, but got %ld != %ld. Please check the input value.",
+              in_dims[i], out_dims[i], in_dims[i], out_dims[i]));
     }
 
     auto lod_level = input.lod().size();
@@ -139,16 +163,29 @@ class MaxSeqPoolGradFunctor {
     auto ig_dims = in_grad->dims();
     auto idx_dims = index.dims();
     PADDLE_ENFORCE_GT(og_dims.size(), 1,
-                      "The rank of output@Grad shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of output@Grad shall be greater than 1, "
+                          "but got %ld <= 1. Please check the input value.",
+                          og_dims.size()));
     PADDLE_ENFORCE_GT(ig_dims.size(), 1,
-                      "The rank of input@Grad shall be greater than 1.");
+                      platform::errors::InvalidArgument(
+                          "The rank of input@Grad shall be greater than 1, but "
+                          "got %ld <= 1. Please check the input value.",
+                          ig_dims.size()));
     for (int64_t i = 1; i < og_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(
-          og_dims[i], ig_dims[i],
-          "The dimension of input@Grad and output@Grad shall be same.");
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i],
+                        platform::errors::InvalidArgument(
+                            "The dimension of input@Grad and output@Grad shall "
+                            "be same. Expected %ld == %ld, but got %ld != %ld. "
+                            "Please check the input value.",
+                            og_dims[i], ig_dims[i], og_dims[i], ig_dims[i]));
     }
-    PADDLE_ENFORCE_EQ(idx_dims, og_dims,
-                      "The dimension of index and output@Grad shall be same.");
+    PADDLE_ENFORCE_EQ(
+        idx_dims, og_dims,
+        platform::errors::InvalidArgument(
+            "The dimension of index and output@Grad shall be same. Expected "
+            "%ld == %ld, but got %ld != %ld. Please check the input value.",
+            idx_dims, og_dims, idx_dims, og_dims));
 
     const T* og_data = out_grad.data<T>();
     const int* max_index = index.data<int>();
@@ -244,9 +281,12 @@ class SumSeqPoolGradFunctor {
     auto lod = in_grad->lod()[lod_level - 1];
     int64_t out_w = out_grad.numel() / out_grad.dims()[0];
     int64_t in_w = in_grad->numel() / in_grad->dims()[0];
-    PADDLE_ENFORCE_EQ(
-        in_w, out_w,
-        "The feature size of input@Grad and output@Grad shall be same.");
+    PADDLE_ENFORCE_EQ(in_w, out_w,
+                      platform::errors::InvalidArgument(
+                          "The feature size of input@Grad and output@Grad "
+                          "shall be same. Expected %ld == %ld, but got %ld != "
+                          "%ld. Please check the input value.",
+                          in_w, out_w, in_w, out_w));
     const T* out_g_data = out_grad.data<T>();
     T* in_g_data = in_grad->mutable_data<T>(context.GetPlace());
     auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
@@ -298,7 +338,8 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
       auto place = context.GetPlace();
       PADDLE_ENFORCE_EQ(
           platform::is_cpu_place(place), true,
-          "Sequence_pool should run on CPU Device when pooltype is SUM");
+          platform::errors::InvalidArgument(
+              "Sequence_pool should run on CPU Device when pooltype is SUM"));
       const T* src = input.data<T>();
       T* dst = output->mutable_data<T>(place);
       jit::seq_pool_attr_t attr(
@@ -342,7 +383,10 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
         out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                               std::sqrt(static_cast<T>(h));
       } else {
-        PADDLE_THROW("unsupported pooling pooltype");
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "unsupported pooling pooltype: %s. Only support \"AVERAGE\" and "
+            "\"SQRT\"",
+            pooltype));
       }
     }
   }
@@ -400,7 +444,10 @@ class SequencePoolGradFunctor<platform::CPUDeviceContext, T> {
       } else if (pooltype == "FIRST") {
         in_g_e.chip(0, 0).device(place) = out_g_e_v;
       } else {
-        PADDLE_THROW("unsupported pooling pooltype");
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "unsupported pooling pooltype: %s. Only support \"AVERAGE\", "
+            "\"SQRT\", \"LAST\" and \"FIRST\"",
+            pooltype));
       }
     }
   }
diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu
index 422b06c70eb2107659666edf58223ae8e4666b1d..cba8dd935ef1b3625f5f68578e411aec81eaa4f4 100644
--- a/paddle/fluid/operators/math/sequence_pooling.cu
+++ b/paddle/fluid/operators/math/sequence_pooling.cu
@@ -205,7 +205,10 @@ class SequencePoolFunctor<platform::CUDADeviceContext, T> {
           lod.CUDAData(context.GetPlace()), lod.size(), item_dim,
           output->mutable_data<T>(context.GetPlace()), nullptr);
     } else {
-      PADDLE_THROW("unsupported pooling pooltype");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "unsupported pooling pooltype: %s. Only support \"MAX\", "
+          "\"AVERAGE\", \"SUM\", \"SQRT\", \"LAST\" and \"FIRST\"",
+          pooltype));
     }
   }
 };
@@ -370,7 +373,10 @@ class SequencePoolGradFunctor<platform::CUDADeviceContext, T> {
           in_grad->mutable_data<T>(context.GetPlace()), nullptr);
 
     } else {
-      PADDLE_THROW("unsupported pooling pooltype");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "unsupported pooling pooltype: %s. Only support \"MAX\", "
+          "\"AVERAGE\", \"SUM\", \"SQRT\", \"LAST\" and \"FIRST\"",
+          pooltype));
     }
   }
 };
diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc
index efab1a375b56bea3caec2c8169dc390298a37cbe..4b5f484e52c6acb2d7fb2cea6265c8dd7826571b 100644
--- a/paddle/fluid/operators/math/sequence_pooling_test.cc
+++ b/paddle/fluid/operators/math/sequence_pooling_test.cc
@@ -50,9 +50,21 @@ void TestSequencePoolingSum(const DeviceContext &context,
   in_grad.mutable_data<T>(in_dims, place);
 
   // check tensor contruction result
-  PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size());
+  PADDLE_ENFORCE_EQ(
+      in_grad.dims().size(), out_grad.dims().size(),
+      paddle::platform::errors::InvalidArgument(
+          "The dimension of input and output shall be same. Expected %ld == "
+          "%ld, but got %ld != %ld. Please check the input value.",
+          in_grad.dims().size(), out_grad.dims().size(), in_grad.dims().size(),
+          out_grad.dims().size()));
   for (int64_t i = 1; i < out_grad.dims().size(); ++i) {
-    PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]);
+    PADDLE_ENFORCE_EQ(
+        in_grad.dims()[i], out_grad.dims()[i],
+        paddle::platform::errors::InvalidArgument(
+            "The dimension of input and output shall be same. Expected %ld == "
+            "%ld, but got %ld != %ld. Please check the input value.",
+            in_grad.dims()[i], out_grad.dims()[i], in_grad.dims()[i],
+            out_grad.dims()[i]));
   }
 
   // call functor
diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc
index cafcf6319326cc9a496de8ee6aa1033e1320f4b0..0344226ea66e2a64fce0574cb45a8c2ce918359c 100644
--- a/paddle/fluid/operators/math/tree2col.cc
+++ b/paddle/fluid/operators/math/tree2col.cc
@@ -55,7 +55,11 @@ void Tree2ColUtil::construct_tree(const paddle::Tensor &EdgeSet,
                                   std::vector<std::vector<int>> *tr,
                                   size_t *node_count) {
   auto edge_set_dims = EdgeSet.dims();
-  PADDLE_ENFORCE_EQ(edge_set_dims[1], 2);
+  PADDLE_ENFORCE_EQ(edge_set_dims[1], 2,
+                    platform::errors::InvalidArgument(
+                        "The second dimension of the EdgeSet shall be 2, but "
+                        "got %ld != 2. Please check the input value.",
+                        edge_set_dims[1]));
   int64_t edge_count = EdgeSet.numel();
 
   const int *edge_data = EdgeSet.data<int>();
diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc
index 13f0845bb8579615381c06072eb1e32507ffd3cf..9ad2ec5005203d02032aaa6f7e48b27f52631059 100644
--- a/paddle/fluid/operators/math/unpooling.cc
+++ b/paddle/fluid/operators/math/unpooling.cc
@@ -37,7 +37,13 @@ class Unpool2dMaxFunctor<platform::CPUDeviceContext, T> {
       for (int c = 0; c < output_channels; ++c) {
         for (int i = 0; i < input_feasize; ++i) {
           int index = indices_data[i];
-          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
+          PADDLE_ENFORCE_LT(
+              index, output_feasize,
+              platform::errors::InvalidArgument(
+                  "index should less than output tensor height * output tensor "
+                  "width. Expected %ld < %ld, but got "
+                  "%ld >= %ld. Please check input value.",
+                  index, output_feasize, index, output_feasize));
           output_data[index] = input_data[i];
         }
         input_data += input_feasize;
@@ -72,7 +78,13 @@ class Unpool2dMaxGradFunctor<platform::CPUDeviceContext, T> {
       for (int c = 0; c < output_channels; ++c) {
         for (int i = 0; i < input_feasize; ++i) {
           int index = indices_data[i];
-          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
+          PADDLE_ENFORCE_LT(
+              index, output_feasize,
+              platform::errors::InvalidArgument(
+                  "index should less than output tensor height * output tensor "
+                  "width. Expected %ld < %ld, but got "
+                  "%ld >= %ld. Please check input value.",
+                  index, output_feasize, index, output_feasize));
           input_grad_data[i] = output_grad_data[index];
         }
         input_grad_data += input_feasize;
diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc
index 59035d5a8ca5d4214f1370e1b14b2be9b234fa6a..cd7a8c6d24eaaca096b630b74dd7cd1bb9d35d09 100644
--- a/paddle/fluid/operators/p_norm_op.cc
+++ b/paddle/fluid/operators/p_norm_op.cc
@@ -105,6 +105,12 @@ class PnormOp : public framework::OperatorWithKernel {
     bool asvector = ctx->Attrs().Get<bool>("asvector");
     if (asvector) {
       reduce_dims.emplace_back(1);
+      if (keepdim) {
+        for (int i = 1; i < x_dim.size(); ++i) {
+          reduce_dims.emplace_back(1);
+        }
+        x_dim = framework::make_ddim(reduce_dims);
+      }
     } else {
       if (axis < 0) axis = x_dim.size() + axis;
       for (int i = 0; i < x_dim.size(); ++i) {
diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
index 6e470e3af4e5860796d898a5e3138c28958264cb..54818470b277443e411ea5f7d9c7561eddc7046a 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc
@@ -51,6 +51,20 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
+template <typename T>
+class ReduceSumDoubleOpGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
+    op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
+    op->SetAttrMap(this->Attrs());
+    op->SetType("reduce_sum");
+  }
+};
+
 DECLARE_NO_NEED_BUFFER_VARS_INFERER(ReduceSumGradNoNeedBufferVarInferer, "X");
 class ReduceSumVarTypeInference : public paddle::framework::VarTypeInference {
  public:
@@ -77,6 +91,8 @@ REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker,
                   ops::ReduceSumOpGradMaker<paddle::framework::OpDesc>,
                   ops::ReduceSumOpGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp,
+                  ops::ReduceSumDoubleOpGradMaker<paddle::framework::OpDesc>,
+                  ops::ReduceSumDoubleOpGradMaker<paddle::imperative::OpBase>,
                   ops::ReduceSumGradNoNeedBufferVarInferer);
 
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index 911dfea50e2e2cf8ec8f230bfc1e0bf4836463b6..0eeb7e0bb24f512aa6859e92de9f490e491543aa 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/roi_align_op.h"
 #include <memory>
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -35,13 +36,13 @@ class ROIAlignOp : public framework::OperatorWithKernel {
     auto input_dims = ctx->GetInputDim("X");
     auto rois_dims = ctx->GetInputDim("ROIs");
 
-    if (ctx->HasInput("RoisLod")) {
-      auto rois_lod_dims = ctx->GetInputDim("RoisLod");
+    if (ctx->HasInput("RoisNum")) {
+      auto rois_num_dims = ctx->GetInputDim("RoisNum");
       PADDLE_ENFORCE_EQ(
-          rois_lod_dims.size(), 1,
-          platform::errors::InvalidArgument("The RoisLod dimension should be 1"
-                                            ", but got dimension = %d",
-                                            rois_lod_dims.size()));
+          rois_num_dims.size(), 1,
+          platform::errors::InvalidArgument("The size of RoisNum should be 1"
+                                            ", but received size = %d",
+                                            rois_num_dims.size()));
     }
     PADDLE_ENFORCE_EQ(
         input_dims.size(), 4,
@@ -145,9 +146,9 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
              "given as [[x1, y1, x2, y2], ...]. "
              "(x1, y1) is the top left coordinates, and "
              "(x2, y2) is the bottom right coordinates.");
-    AddInput("RoisLod",
+    AddInput("RoisNum",
              "(Tensor), "
-             "The lod info of rois.")
+             "The number of RoIs in each image.")
         .AsDispensable();
     AddOutput("Out",
               "(Tensor), "
@@ -203,7 +204,7 @@ class ROIAlignGradMaker : public framework::SingleGradOpMaker<T> {
     op->SetType("roi_align_grad");
     op->SetInput("X", this->Input("X"));
     op->SetInput("ROIs", this->Input("ROIs"));
-    op->SetInput("RoisLod", this->Input("RoisLod"));
+    op->SetInput("RoisNum", this->Input("RoisNum"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
     op->SetAttrMap(this->Attrs());
@@ -231,3 +232,10 @@ REGISTER_OP_CPU_KERNEL(
     ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, float>,
     ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, double>,
     ops::CPUROIAlignGradOpKernel<paddle::platform::CPUDeviceContext, int>);
+REGISTER_OP_VERSION(roi_align)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade roi_align add a new input [RoisNum])ROC",
+        paddle::framework::compatible::OpVersionDesc().NewInput(
+            "RoisNum",
+            "The number of RoIs in each image. RoisNum is dispensable."));
diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu
index f7ec13e5bccd63d2f6552ed52f8d709a57320ddd..3a4ce55f4fb77160e7fc645539c1868fe2864b19 100644
--- a/paddle/fluid/operators/roi_align_op.cu
+++ b/paddle/fluid/operators/roi_align_op.cu
@@ -257,24 +257,26 @@ class GPUROIAlignOpKernel : public framework::OpKernel<T> {
     int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
     auto& dev_ctx = ctx.cuda_device_context();
     auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod = ctx.Input<Tensor>("RoisLod");
-      int rois_batch_size = rois_lod->numel();
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
+      int rois_batch_size = rois_num_t->numel();
       PADDLE_ENFORCE_EQ(
-          rois_batch_size - 1, batch_size,
+          rois_batch_size, batch_size,
           platform::errors::InvalidArgument(
               "The rois_batch_size and imgs "
               "batch_size must be the same. But received rois_batch_size = %d, "
               "batch_size = %d",
               rois_batch_size, batch_size));
 
-      std::vector<int64_t> rois_lod_(rois_batch_size);
-      memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data<int64_t>(),
-                   sizeof(int64_t) * rois_batch_size, 0);
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
+      std::vector<int> rois_num_list(rois_batch_size);
+      memory::Copy(cplace, rois_num_list.data(), gplace,
+                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_list[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_list[n];
       }
     } else {
       auto lod = rois->lod();
@@ -348,16 +350,18 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel<T> {
 
     auto& dev_ctx = ctx.cuda_device_context();
     auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod = ctx.Input<Tensor>("RoisLod");
-      int rois_batch_size = rois_lod->numel();
-      std::vector<int64_t> rois_lod_(rois_batch_size);
-      memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data<int64_t>(),
-                   sizeof(int64_t) * rois_batch_size, 0);
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
+      int rois_batch_size = rois_num_t->numel();
+      std::vector<int> rois_num_list(rois_batch_size);
+      memory::Copy(cplace, rois_num_list.data(), gplace,
+                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (size_t i = start; i < start + rois_num_list[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_list[n];
       }
     } else {
       auto rois_lod = rois->lod().back();
diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h
index 366f865411461c91b4ec88203390b15fdba4414c..066125a92fbd9d1d49f0ba023366865620674e1f 100644
--- a/paddle/fluid/operators/roi_align_op.h
+++ b/paddle/fluid/operators/roi_align_op.h
@@ -165,21 +165,23 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
     int* roi_batch_id_data =
         roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
     int rois_batch_size;
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
-      rois_batch_size = rois_lod_t->numel();
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
+      rois_batch_size = rois_num_t->numel();
       PADDLE_ENFORCE_EQ(
-          rois_batch_size - 1, batch_size,
+          rois_batch_size, batch_size,
           platform::errors::InvalidArgument(
               "The batch size of rois and the batch size of images "
               " must be the same. But received the batch size of rois is %d, "
               "and the batch size of images is %d",
               rois_batch_size, batch_size));
-      auto* rois_lod = rois_lod_t->data<int64_t>();
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+      auto* rois_num_data = rois_num_t->data<int>();
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_data[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_data[n];
       }
     } else {
       auto lod = rois->lod();
@@ -303,14 +305,16 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
         roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
 
     int rois_batch_size;
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
-      rois_batch_size = rois_lod_t->numel();
-      auto* rois_lod = rois_lod_t->data<int64_t>();
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
+      rois_batch_size = rois_num_t->numel();
+      auto* rois_num_data = rois_num_t->data<int>();
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_data[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_data[n];
       }
     } else {
       auto rois_lod = rois->lod().back();
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 8a34cb35f6bf8dde97d29c02749d8a52fdf5f090..be3187b7513144f583458f3d7902a102e531a981 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/roi_pool_op.h"
 #include <memory>
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -34,12 +35,13 @@ class ROIPoolOp : public framework::OperatorWithKernel {
     auto input_dims = ctx->GetInputDim("X");
     auto rois_dims = ctx->GetInputDim("ROIs");
 
-    if (ctx->HasInput("RoisLod")) {
-      auto rois_lod_dims = ctx->GetInputDim("RoisLod");
-      PADDLE_ENFORCE_EQ(rois_lod_dims.size(), 1,
+    if (ctx->HasInput("RoisNum")) {
+      auto rois_num_dims = ctx->GetInputDim("RoisNum");
+      PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1,
                         platform::errors::InvalidArgument(
-                            "The lod information tensor of ROIs should "
-                            "be one-dimensional"));
+                            "The second dimension of RoisNum should "
+                            "be 1, but received dimension is %d",
+                            rois_num_dims.size()));
     }
     PADDLE_ENFORCE_EQ(input_dims.size(), 4,
                       platform::errors::InvalidArgument(
@@ -140,7 +142,8 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "Where batch_id is the id of the data, "
              "(x1, y1) is the top left coordinates, and "
              "(x2, y2) is the bottom right coordinates.");
-    AddInput("RoisLod", "(Tensor), The lod info of rois.").AsDispensable();
+    AddInput("RoisNum", "(Tensor), The number of RoIs in each image.")
+        .AsDispensable();
     AddOutput("Out",
               "(Tensor), "
               "The output of ROIPoolOp is a 4-D tensor with shape "
@@ -197,7 +200,7 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker<T> {
     op->SetType("roi_pool_grad");
     op->SetInput("X", this->Input("X"));
     op->SetInput("ROIs", this->Input("ROIs"));
-    op->SetInput("RoisLod", this->Input("RoisLod"));
+    op->SetInput("RoisNum", this->Input("RoisNum"));
     op->SetInput("Argmax", this->Output("Argmax"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
@@ -223,3 +226,10 @@ REGISTER_OP_CPU_KERNEL(
     ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
     ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>,
     ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, int>);
+REGISTER_OP_VERSION(roi_pool)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade roi_pool add a new input [RoisNum])ROC",
+        paddle::framework::compatible::OpVersionDesc().NewInput(
+            "RoisNum",
+            "The number of RoIs in each image. RoisNum is dispensable."));
diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu
index 1e8a8e3037d84f980d963d359ce791b1ddba47d3..98d9ef6b6e11440d38abbedbfd93f6d3544d77bc 100644
--- a/paddle/fluid/operators/roi_pool_op.cu
+++ b/paddle/fluid/operators/roi_pool_op.cu
@@ -157,19 +157,21 @@ class GPUROIPoolOpKernel : public framework::OpKernel<T> {
     int* roi_batch_id_data = roi_batch_id_list.mutable_data<int>(cplace);
     auto& dev_ctx = ctx.cuda_device_context();
     auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod = ctx.Input<Tensor>("RoisLod");
-      int rois_batch_size = rois_lod->numel();
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
+      int rois_batch_size = rois_num_t->numel();
       PADDLE_ENFORCE_EQ(
-          rois_batch_size - 1, batch_size,
+          rois_batch_size, batch_size,
           "The rois_batch_size and imgs batch_size must be the same.");
-      std::vector<int64_t> rois_lod_(rois_batch_size);
-      memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data<int64_t>(),
-                   sizeof(int64_t) * rois_batch_size, 0);
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
+      std::vector<int> rois_num_list(rois_batch_size);
+      memory::Copy(cplace, rois_num_list.data(), gplace,
+                   rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_list[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_list[n];
       }
     } else {
       auto rois_lod = rois->lod().back();
@@ -206,7 +208,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<Tensor>("X");
     auto* rois = ctx.Input<LoDTensor>("ROIs");
-    auto* rois_lod = ctx.Input<Tensor>("RoisLod");
+    auto* rois_lod = ctx.Input<Tensor>("RoisNum");
     auto* argmax = ctx.Input<Tensor>("Argmax");
 
     auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
@@ -229,17 +231,18 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
 
       auto& dev_ctx = ctx.cuda_device_context();
       auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
-      if (ctx.HasInput("RoisLod")) {
-        auto* rois_lod = ctx.Input<Tensor>("RoisLod");
-        int rois_batch_size = rois_lod->numel();
-        std::vector<int64_t> rois_lod_(rois_batch_size);
-        memory::Copy(cplace, rois_lod_.data(), gplace,
-                     rois_lod->data<int64_t>(),
-                     sizeof(int64_t) * rois_batch_size, 0);
-        for (int n = 0; n < rois_batch_size - 1; ++n) {
-          for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) {
+      if (ctx.HasInput("RoisNum")) {
+        auto* rois_num_t = ctx.Input<Tensor>("RoisNum");
+        int rois_batch_size = rois_num_t->numel();
+        std::vector<int> rois_num_list(rois_batch_size);
+        memory::Copy(cplace, rois_num_list.data(), gplace,
+                     rois_num_t->data<int>(), sizeof(int) * rois_batch_size, 0);
+        int start = 0;
+        for (int n = 0; n < rois_batch_size; ++n) {
+          for (int i = start; i < start + rois_num_list[n]; ++i) {
             roi_batch_id_data[i] = n;
           }
+          start += rois_num_list[n];
         }
       } else {
         auto rois_lod = rois->lod().back();
diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h
index 145b170dedf0613328223526b0a40a3c064f3028..40de6d0cf6abbcc4a1505cb6eb121ca70813c780 100644
--- a/paddle/fluid/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
@@ -58,18 +58,20 @@ class CPUROIPoolOpKernel : public framework::OpKernel<T> {
         roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
 
     int rois_batch_size;
-    if (ctx.HasInput("RoisLod")) {
-      auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
-      rois_batch_size = rois_lod_t->numel();
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
+      rois_batch_size = rois_num_t->numel();
       PADDLE_ENFORCE_EQ(
-          rois_batch_size - 1, batch_size,
+          rois_batch_size, batch_size,
           platform::errors::InvalidArgument("The rois_batch_size and imgs "
                                             "batch_size must be the same."));
-      auto* rois_lod = rois_lod_t->data<int64_t>();
-      for (int n = 0; n < rois_batch_size - 1; ++n) {
-        for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+      auto* rois_num_data = rois_num_t->data<int>();
+      int start = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (int i = start; i < start + rois_num_data[n]; ++i) {
           roi_batch_id_data[i] = n;
         }
+        start += rois_num_data[n];
       }
     } else {
       auto rois_lod = rois->lod().back();
@@ -185,14 +187,16 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
           roi_batch_id_list.mutable_data<int>(ctx.GetPlace());
 
       int rois_batch_size;
-      if (ctx.HasInput("RoisLod")) {
-        auto* rois_lod_t = ctx.Input<framework::Tensor>("RoisLod");
-        rois_batch_size = rois_lod_t->numel();
-        auto* rois_lod = rois_lod_t->data<int64_t>();
-        for (int n = 0; n < rois_batch_size - 1; ++n) {
-          for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+      if (ctx.HasInput("RoisNum")) {
+        auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
+        rois_batch_size = rois_num_t->numel();
+        auto* rois_num_data = rois_num_t->data<int>();
+        int start = 0;
+        for (int n = 0; n < rois_batch_size; ++n) {
+          for (int i = start; i < start + rois_num_data[n]; ++i) {
             roi_batch_id_data[i] = n;
           }
+          start += rois_num_data[n];
         }
       } else {
         auto rois_lod = rois->lod().back();
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 256faf04ea6de5835f22113537caac49ca1dbab4..178ecaff7e8d2e575cd64927fe4e39c773b2cb99 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -43,6 +43,11 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
     {"nll_loss", {"X", "Label", "Weight"}},
     {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}},
     {"gather", {"X", "Index", "Axis"}},
+    {"roi_pool", {"X", "ROIs", "RoisNum"}},
+    {"roi_align", {"X", "ROIs", "RoisNum"}},
+    {"collect_fpn_proposals",
+     {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}},
+    {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}},
 };
 
 // NOTE(zhiqiu): Like op_ins_map.
@@ -63,6 +68,10 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
      {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance",
       "ReserveSpace"}},
     {"unique", {"Out", "Index", "Indices", "Counts"}},
+    {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}},
+    {"collect_fpn_proposals", {"FpnRois", "RoisNum"}},
+    {"distribute_fpn_proposals",
+     {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}},
 };
 
 // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 11932ce728889efde2c6cf5ecd8b8252e1192c19..1616e237092037303073236809a16a72651568cd 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -111,6 +111,8 @@ goto:success
 :CASE_wincheck_openblas
 set WITH_MKL=OFF
 set WITH_GPU=ON
+rem Temporarily turn off WITH_INFERENCE_API_TEST on GPU due to compile hang
+set WITH_INFERENCE_API_TEST=OFF
 call :cmake || goto cmake_error
 call :build || goto build_error
 call :test_whl_pacakage || goto test_whl_pacakage_error
@@ -242,7 +244,7 @@ dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin
 dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin
 
 set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH%
-ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4
+ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4
 goto:eof
 
 :unit_test_error
diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
index f394a792e3a5750b13da65f50d84e3a0f516f617..7dca7b9cb88a37fb7954c3e8059a7ae09e2ef3a6 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
@@ -13,6 +13,10 @@
 
 from paddle import fluid
 from .meta_optimizer_base import MetaOptimizerBase
+from paddle.fluid import core
+import subprocess
+import re
+import platform
 
 
 class ParameterServerOptimizer(MetaOptimizerBase):
@@ -28,6 +32,9 @@ class ParameterServerOptimizer(MetaOptimizerBase):
     def _can_apply(self):
         if self.role_maker._is_collective:
             return False
+        if self.user_defined_strategy.auto == True:
+            return True
+
         k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
         return True if k_steps >= 0 else False
 
@@ -127,6 +134,105 @@ class ParameterServerOptimizer(MetaOptimizerBase):
 
         return _main, _startup
 
+    def _try_auto_apply_geo(self, program, compiled_config):
+        def get_sys_free_mem():
+            plat = platform.system()
+            if platform.system() == "Darwin":
+                vm = subprocess.Popen(
+                    ['vm_stat'], stdout=subprocess.PIPE).communicate()[0]
+                # Process vm_stat
+                vmLines = vm.split('\n')
+                sep = re.compile(':[\s]+')
+                vmStats = {}
+                for row in range(1, len(vmLines) - 2):
+                    rowText = vmLines[row].strip()
+                    rowElements = sep.split(rowText)
+                    vmStats[(rowElements[0]
+                             )] = int(rowElements[1].strip('\.')) * 4096
+                return vmStats["Pages free"]
+            elif platform.system() == "Linux":
+                mems = {}
+                with open('/proc/meminfo', 'rb') as f:
+                    for line in f:
+                        fields = line.split()
+                        mems[fields[0]] = int(fields[1]) * 1024
+                free = mems[b'MemFree:']
+                return free
+            else:
+                raise ValueError(
+                    "%s platform is unsupported is parameter server optimizer" %
+                    (platform.system()))
+
+        if self.user_defined_strategy.auto == False:
+            return
+
+        a_sync_configs = self.user_defined_strategy.a_sync_configs
+        if a_sync_configs["k_steps"] >= 0:
+            return
+
+        self.user_defined_strategy.a_sync = True
+        if not isinstance(self.inner_opt, fluid.optimizer.SGDOptimizer):
+            # auto async
+            a_sync_configs["k_steps"] = 0
+            self.user_defined_strategy.a_sync_configs = a_sync_configs
+            return
+
+        from paddle.fluid.incubate.fleet.parameter_server.ir.vars_metatools import dtype_to_size
+        free = get_sys_free_mem()
+
+        param_grad_pairs = compiled_config.origin_sparse_pairs + compiled_config.origin_dense_pairs
+        processed_var_names = set(["@EMPTY@"])
+
+        param_memory_size = 0
+        for param_grad_pair in param_grad_pairs:
+            param, grad = param_grad_pair
+            param_memory_size += param.m_size
+            processed_var_names.add(param.name)
+
+        upper_mem_use = param_memory_size * 5.0
+
+        program_tmp_vars = dict()
+        batch_size = 1024
+        for op in program.global_block().ops:
+            for var_name in op.output_arg_names:
+                if var_name in processed_var_names:
+                    continue
+                processed_var_names.add(var_name)
+                var = program.global_block().vars[var_name]
+
+                if var.desc.type() != core.VarDesc.VarType.LOD_TENSOR:
+                    continue
+
+                data_count = 1
+                neg_dim_count = 0
+                for x in var.shape:
+                    if x < 0:
+                        if neg_dim_count >= 1:
+                            raise ValueError(
+                                "Var %s has more than one negative dim." %
+                                (var_name))
+                        neg_dim_count += 1
+                        data_count *= (-x)
+                    else:
+                        data_count *= x
+                program_tmp_vars[var_name] = (data_count, neg_dim_count,
+                                              dtype_to_size[var.dtype])
+
+        for varname in program_tmp_vars:
+            data_count, neg_dim_count, type_size = program_tmp_vars[varname]
+            if neg_dim_count == 1:
+                data_count *= batch_size
+            var_memory = data_count * type_size
+            upper_mem_use += var_memory
+
+        if upper_mem_use < free:
+            # auto geo
+            a_sync_configs["k_steps"] = 800
+        else:
+            # auto async
+            a_sync_configs["k_steps"] = 0
+        self.user_defined_strategy.a_sync_configs = a_sync_configs
+
     def minimize_impl(self,
                       loss,
                       startup_program=None,
@@ -134,7 +240,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                       no_grad_set=None):
         self.inner_opt.minimize(loss, startup_program, parameter_list,
                                 no_grad_set)
-        strategy = self._get_distributed_strategy()
 
         _origin_main_program = loss.block.program
         _origin_startup_program = startup_program
@@ -142,7 +247,12 @@ class ParameterServerOptimizer(MetaOptimizerBase):
 
         compiled_config = public.CompileTimeStrategy(_origin_main_program,
                                                      _origin_startup_program,
-                                                     strategy, self.role_maker)
+                                                     None, self.role_maker)
+
+        self._try_auto_apply_geo(_origin_main_program, compiled_config)
+
+        strategy = self._get_distributed_strategy()
+        compiled_config.strategy = strategy
 
         if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
             main_program, startup_program = self._build_trainer_programs(
diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py
index 0e187d4174cd5cca65f79e4ab84b4cc32ecefd21..7b564b3f837c001673bdd272ba60edf31cde21fb 100644
--- a/python/paddle/fluid/contrib/layers/nn.py
+++ b/python/paddle/fluid/contrib/layers/nn.py
@@ -37,7 +37,7 @@ import warnings
 import inspect
 
 import numpy as np
-
+import paddle
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.layers import utils
 from ... import unique_name
@@ -56,7 +56,8 @@ __all__ = [
     'match_matrix_tensor', 'tree_conv', 'fused_embedding_seq_pool',
     'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat',
     'sparse_embedding', 'partial_sum', 'tdm_child', 'rank_attention',
-    'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice'
+    'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice',
+    'correlation'
 ]
 
 
@@ -1546,3 +1547,81 @@ def bilateral_slice(x, guide, grid, has_offset, name=None):
         attrs={'has_offset': has_offset},
         outputs={'Out': out})
     return out
+
+
+def correlation(x,
+                y,
+                pad_size,
+                kernel_size,
+                max_displacement,
+                stride1,
+                stride2,
+                corr_type_multiply=1):
+    """
+
+    This operation compute correlation of two tensor.
+    For more information of correlation, please refer to PWC-Net: 
+    CNNs for Optical Flow Using Pyramid, Warping, and Cost Volume 
+    <https://arxiv.org/pdf/1709.02371.pdf>_
+
+    Args:
+        x(Tensor): The input x is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64.
+        y(Tensor): The input y is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64.
+        pad_size(int): Pad size. The data type is int.
+        max_displacement(int): Max displacement. The data type is int.
+        stride1(int): stride size of x. The data type is int.
+        stride2(int): stride size of y. The data type is int.
+        corr_type_multiply(int, optional): The type of multiply. The data type is int. Default: 1.
+
+    Returns:
+        Tensor: The data type is same as input tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+
+            x1 = fluid.layers.data(name='x1',
+                               shape=x_shape,
+                               dtype=x_type,
+                               append_batch_size=False)
+            x2 = fluid.layers.data(name='x2',
+                                shape=x_shape,
+                                dtype=x_type,
+                                append_batch_size=False)
+
+
+            out = fluid.contrib.correlation(
+                            x1,
+                            x2,
+                            pad_size=4,
+                            kernel_size=1,
+                            max_displacement=4,
+                            stride1=1,
+                            stride2=1)
+
+    """
+
+    helper = LayerHelper("correlation", **locals())
+    output = helper.create_variable_for_type_inference(dtype=x.dtype)
+    if paddle.fluid.in_dygraph_mode():
+        attrs = ("pad_size", pad_size, "kernel_size", kernel_size,
+                 "max_displacement", max_displacement, "stride1", stride1,
+                 "stride2", stride2, "corr_type_multiply", corr_type_multiply)
+        output = getattr(core.ops, "correlation")(x, y, *attrs)
+    else:
+        helper.append_op(
+            type="correlation",
+            inputs={"Input1": x,
+                    "Input2": y},
+            attrs={
+                "pad_size": pad_size,
+                "kernel_size": kernel_size,
+                "max_displacement": max_displacement,
+                "stride1": stride1,
+                "stride2": stride2,
+                "corr_type_multiply": corr_type_multiply
+            },
+            outputs={"Output": output})
+    return output
diff --git a/python/paddle/fluid/contrib/tests/test_correlation.py b/python/paddle/fluid/contrib/tests/test_correlation.py
new file mode 100644
index 0000000000000000000000000000000000000000..7fcef4dbcd1efd3655b6339ed5ec880d8cd33fc0
--- /dev/null
+++ b/python/paddle/fluid/contrib/tests/test_correlation.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.base import to_variable
+
+
+def corr(x_1,
+         x_2,
+         pad_size=4,
+         kernel_size=1,
+         max_displacement=4,
+         stride1=1,
+         stride2=1,
+         corr_multiply=1):
+    K = kernel_size
+
+    rinput1 = np.pad(x_1, ((0, 0), (0, 0), (pad_size, pad_size),
+                           (pad_size, pad_size)),
+                     mode='constant')
+    rinput2 = np.pad(x_2, ((0, 0), (0, 0), (pad_size, pad_size),
+                           (pad_size, pad_size)),
+                     mode='constant')
+    rinput1 = np.transpose(rinput1, (0, 2, 3, 1))
+    rinput2 = np.transpose(rinput2, (0, 2, 3, 1))
+    B = int(rinput1.shape[0])
+    H = int(x_1.shape[2])
+    W = int(x_2.shape[3])
+    d = max_displacement
+    D = 2 * d + 1
+    output = np.zeros((B, D * D, H, W), dtype=np.float32)
+
+    for b in range(B):
+        for i in range(H):
+            for j in range(W):
+                for k in range(-d, d + 1):
+                    for l in range(-d, d + 1):
+                        x1_index = i + pad_size
+                        y1_index = j + pad_size
+                        x2_index = x1_index + k
+                        y2_index = y1_index + l
+                        output[b, l + d + D * (k + d), i, j] = np.mean(
+                            rinput1[b, x1_index:x1_index + K, y1_index:y1_index
+                                    + K] * rinput2[b, x2_index:x2_index + K,
+                                                   y2_index:y2_index + K])
+
+    return output
+
+
+class TestCorrelationOp(unittest.TestCase):
+    def test_check_output(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        np.random.seed(13)
+        np.set_printoptions(threshold=np.inf)
+        x_shape = (2, 10, 3, 3)
+        x_type = 'float32'
+        x1 = fluid.layers.data(
+            name='x1',
+            shape=x_shape,
+            dtype=x_type,
+            append_batch_size=False,
+            stop_gradient=False)
+        x2 = fluid.layers.data(
+            name='x2',
+            shape=x_shape,
+            dtype=x_type,
+            append_batch_size=False,
+            stop_gradient=False)
+
+        x1_np = np.random.randn(2, 3, 4, 5).astype(x_type)
+        x2_np = np.random.randn(2, 3, 4, 5).astype(x_type)
+        out_np = corr(
+            x1_np,
+            x2_np,
+            pad_size=4,
+            kernel_size=1,
+            max_displacement=4,
+            stride1=1,
+            stride2=1)
+
+        out = fluid.contrib.correlation(
+            x1,
+            x2,
+            pad_size=4,
+            kernel_size=1,
+            max_displacement=4,
+            stride1=1,
+            stride2=1)
+
+        loss = fluid.layers.reduce_mean(out)
+        optimizer = fluid.optimizer.Momentum(0.0001, 0.9)
+        optimizer.minimize(loss)
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        res = exe.run(feed={'x1': x1_np,
+                            'x2': x2_np},
+                      fetch_list=[out.name, loss.name])
+
+        self.assertTrue(np.allclose(res[0], out_np))
+
+
+class Net(fluid.dygraph.Layer):
+    def __init__(self, name_scope):
+        super(Net, self).__init__(name_scope)
+
+    def forward(self, x1, x2):
+        y = fluid.contrib.correlation(
+            x1,
+            x2,
+            pad_size=4,
+            kernel_size=1,
+            max_displacement=4,
+            stride1=1,
+            stride2=1)
+        return y
+
+
+class TestCorrelationOpDyGraph(unittest.TestCase):
+    def test_check_output(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        np.random.seed(13)
+        np.set_printoptions(threshold=np.inf)
+        x_shape = (2, 10, 3, 3)
+        x_type = 'float32'
+        place = fluid.CUDAPlace(0)
+        with fluid.dygraph.guard(place):
+            x1_np = np.random.randn(2, 3, 4, 5).astype(x_type)
+            x2_np = np.random.randn(2, 3, 4, 5).astype(x_type)
+            out_np = corr(
+                x1_np,
+                x2_np,
+                pad_size=4,
+                kernel_size=1,
+                max_displacement=4,
+                stride1=1,
+                stride2=1)
+
+            x1 = to_variable(x1_np)
+            x2 = to_variable(x2_np)
+            corr_pd = Net('corr_pd')
+            y = corr_pd(x1, x2)
+            out = y.numpy()
+            self.assertTrue(np.allclose(out, out_np))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
index 90e38bd98863ff62174bd569a483b11984480b5a..37ce8b0a152ff8e258e8aee2a54ed7215f77c146 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py
@@ -135,6 +135,11 @@ class FunctionSpec(object):
 
             input_with_spec = pack_sequence_as(args, input_with_spec)
 
+        # If without specificing name in input_spec, add default name
+        # according to argument name from decorated function.
+        input_with_spec = replace_spec_empty_name(self._arg_names,
+                                                  input_with_spec)
+
         return input_with_spec
 
     @switch_to_static_graph
@@ -309,3 +314,61 @@ def convert_to_input_spec(inputs, input_spec):
         raise TypeError(
             "The type(input_spec) should be a `InputSpec` or dict/list/tuple of it, but received {}.".
             type_name(input_spec))
+
+
+def replace_spec_empty_name(args_name, input_with_spec):
+    """
+    Adds default name according to argument name from decorated function
+    if without specificing InputSpec.name
+
+    The naming rule are as followed:
+        1. If InputSpec.name is not None, do nothing.
+        2. If each argument `x` corresponds to an InputSpec, using the argument name like `x`
+        3. If the arguments `inputs` corresponds to a list(InputSpec), using name like `inputs_0`, `inputs_1`
+        4. If the arguments `input_dic` corresponds to a dict(InputSpec), using key as name.
+
+    For example:
+        
+        # case 1: foo(x, y)
+        foo = to_static(foo, input_spec=[InputSpec([None, 10]), InputSpec([None])])
+        print([in_var.name for in_var in foo.inputs])  # [x, y]
+
+        # case 2: foo(inputs) where inputs is a list
+        foo = to_static(foo, input_spec=[[InputSpec([None, 10]), InputSpec([None])]])
+        print([in_var.name for in_var in foo.inputs])  # [inputs_0, inputs_1]
+
+        # case 3: foo(inputs) where inputs is a dict
+        foo = to_static(foo, input_spec=[{'x': InputSpec([None, 10]), 'y': InputSpec([None])}])
+        print([in_var.name for in_var in foo.inputs])  # [x, y]
+    """
+    input_with_spec = list(input_with_spec)
+    candidate_arg_names = args_name[:len(input_with_spec)]
+
+    for i, arg_name in enumerate(candidate_arg_names):
+        input_spec = input_with_spec[i]
+        input_with_spec[i] = _replace_spec_name(arg_name, input_spec)
+
+    return input_with_spec
+
+
+def _replace_spec_name(name, input_spec):
+    """
+    Replaces InputSpec.name with given `name` while not specificing it.
+    """
+    if isinstance(input_spec, paddle.static.InputSpec):
+        if input_spec.name is None:
+            input_spec.name = name
+        return input_spec
+    elif isinstance(input_spec, (list, tuple)):
+        processed_specs = []
+        for i, spec in enumerate(input_spec):
+            new_name = "{}_{}".format(name, i)
+            processed_specs.append(_replace_spec_name(new_name, spec))
+        return processed_specs
+    elif isinstance(input_spec, dict):
+        processed_specs = {}
+        for key, spec in six.iteritems(input_spec):
+            processed_specs[key] = _replace_spec_name(key, spec)
+        return processed_specs
+    else:
+        return input_spec
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index d520fe61888cf3b11efc61d67ce566a3407dc6ff..ad2088116243e3a0a75fb43b9bb34b19456c84de 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -37,7 +37,7 @@ from paddle.fluid.wrapped_decorator import wrap_decorator
 
 __all__ = [
     'TracedLayer', 'declarative', 'dygraph_to_static_func', 'set_code_level',
-    'set_verbosity'
+    'set_verbosity', 'save', 'load', 'SaveLoadConfig'
 ]
 
 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 5281df9ead10acea5ae8656dcc4a0eed14fb3e83..797b32f5d4768af59fa4e6aceb75e4b6d9029d91 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -217,7 +217,7 @@ def _dygraph_not_support_(func):
 def _dygraph_only_(func):
     def __impl__(*args, **kwargs):
         assert in_dygraph_mode(
-        ), "We Only support %s in imperative mode, please use fluid.dygraph.guard() as context to run it in imperative Mode" % func.__name__
+        ), "We Only support %s in dynamic mode, please call 'paddle.disable_static()' to enter dynamic mode." % func.__name__
         return func(*args, **kwargs)
 
     return __impl__
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py
index c8f3643b25be0780bbdfd1668d849ab00ece355c..c80b4a800bd149cced13c0e25322cd03ef94e468 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py
@@ -12,9 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
+from functools import reduce
+
 from paddle.fluid.framework import Variable
 from paddle.fluid import core
 
+dtype_to_size = {
+    core.VarDesc.VarType.FP16: 2,
+    core.VarDesc.VarType.FP32: 4,
+    core.VarDesc.VarType.FP64: 8,
+    core.VarDesc.VarType.INT16: 2,
+    core.VarDesc.VarType.INT32: 4,
+    core.VarDesc.VarType.INT64: 8,
+    core.VarDesc.VarType.BOOL: 1,
+    core.VarDesc.VarType.UINT8: 1,
+}
+
 
 class VarBlock:
     def __init__(self, varname, offset, size):
@@ -51,11 +64,14 @@ class VarStruct(object):
         self.type = type
         self.lod_level = lod_level
         self.persistable = persistable
+        self.m_size = 1
+        self.m_size = reduce(lambda x, y: x * y, shape)
+        self.m_size *= dtype_to_size[dtype]
 
     def __str__(self):
-        return "N: {}, S: {}, D: {}, T: {}, LL: {}, P: {}".format(
+        return "N: {}, S: {}, D: {}, T: {}, LL: {}, P: {}, M: {}".format(
             self.name, self.shape, self.dtype, self.type, self.lod_level,
-            self.persistable)
+            self.persistable, self.m_size)
 
 
 class VarDistributed(object):
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index ea6abe2d335e6669b27ba278c0faaca62ca0fdbb..bf87d1fc5a947e48845a3783fd71922641e28819 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -20,7 +20,8 @@ from __future__ import print_function
 from .layer_function_generator import generate_layer_fn
 from .layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
-from ..framework import Variable
+from ..framework import Variable, in_dygraph_mode
+from .. import core
 from .loss import softmax_with_cross_entropy
 from . import tensor
 from . import nn
@@ -2893,8 +2894,8 @@ def generate_proposals(scores,
                        nms_thresh=0.5,
                        min_size=0.1,
                        eta=1.0,
-                       name=None,
-                       return_rois_num=False):
+                       return_rois_num=False,
+                       name=None):
     """
 	:alias_main: paddle.nn.functional.generate_proposals
 	:alias: paddle.nn.functional.generate_proposals,paddle.nn.functional.vision.generate_proposals
@@ -2949,6 +2950,10 @@ def generate_proposals(scores,
             num of each image in one batch. The N is the image's num. For example, the tensor has values [4,5] that represents
             the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. 
             'False' by default. 
+        name(str, optional): For detailed information, please refer 
+            to :ref:`api_guide_Name`. Usually name is no need to set and 
+            None by default. 
+
     Returns:
         tuple:
         A tuple with format ``(rpn_rois, rpn_roi_probs)``.
@@ -2969,6 +2974,14 @@ def generate_proposals(scores,
                          im_info, anchors, variances)
 
     """
+    if in_dygraph_mode():
+        assert return_rois_num, "return_rois_num should be True in dygraph mode."
+        attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n,
+                 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta)
+        rpn_rois, rpn_roi_probs, rpn_rois_num = core.ops.generate_proposals(
+            scores, bbox_deltas, im_info, anchors, variances, *attrs)
+        return rpn_rois, rpn_roi_probs, rpn_rois_num
+
     helper = LayerHelper('generate_proposals', **locals())
 
     check_variable_and_dtype(scores, 'scores', ['float32'],
@@ -2986,7 +2999,14 @@ def generate_proposals(scores,
         dtype=bbox_deltas.dtype)
     rpn_roi_probs = helper.create_variable_for_type_inference(
         dtype=scores.dtype)
-    rpn_rois_lod = helper.create_variable_for_type_inference(dtype='int32')
+    outputs = {
+        'RpnRois': rpn_rois,
+        'RpnRoiProbs': rpn_roi_probs,
+    }
+    if return_rois_num:
+        rpn_rois_num = helper.create_variable_for_type_inference(dtype='int32')
+        rpn_rois_num.stop_gradient = True
+        outputs['RpnRoisNum'] = rpn_rois_num
 
     helper.append_op(
         type="generate_proposals",
@@ -3004,17 +3024,12 @@ def generate_proposals(scores,
             'min_size': min_size,
             'eta': eta
         },
-        outputs={
-            'RpnRois': rpn_rois,
-            'RpnRoiProbs': rpn_roi_probs,
-            'RpnRoisLod': rpn_rois_lod
-        })
+        outputs=outputs)
     rpn_rois.stop_gradient = True
     rpn_roi_probs.stop_gradient = True
-    rpn_rois_lod.stop_gradient = True
 
     if return_rois_num:
-        return rpn_rois, rpn_roi_probs, rpn_rois_lod
+        return rpn_rois, rpn_roi_probs, rpn_rois_num
     else:
         return rpn_rois, rpn_roi_probs
 
@@ -3656,6 +3671,7 @@ def distribute_fpn_proposals(fpn_rois,
                              max_level,
                              refer_level,
                              refer_scale,
+                             rois_num=None,
                              name=None):
     """
 	:alias_main: paddle.nn.functional.distribute_fpn_proposals
@@ -3687,6 +3703,11 @@ def distribute_fpn_proposals(fpn_rois,
             come from.
         refer_level(int32): The referring level of FPN layer with specified scale.
         refer_scale(int32): The referring scale of FPN layer with specified level.
+        rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. 
+            The shape is [B] and data type is int32. B is the number of images.
+            If it is not None then return a list of 1-D Tensor. Each element 
+            is the output RoIs' number of each image on the corresponding level
+            and the shape is [B]. None by default.
         name(str, optional): For detailed information, please refer 
             to :ref:`api_guide_Name`. Usually name is no need to set and 
             None by default. 
@@ -3702,6 +3723,10 @@ def distribute_fpn_proposals(fpn_rois,
         the number of total rois. The data type is int32. It is
         used to restore the order of fpn_rois.
 
+        rois_num_per_level(List): A list of 1-D Tensor and each Tensor is 
+        the RoIs' number in each image on the corresponding level. The shape 
+        is [B] and data type of int32. B is the number of images
+
 
     Examples:
         .. code-block:: python
@@ -3716,26 +3741,52 @@ def distribute_fpn_proposals(fpn_rois,
                 refer_level=4,
                 refer_scale=224)
     """
+    num_lvl = max_level - min_level + 1
+
+    if in_dygraph_mode():
+        assert rois_num is not None, "rois_num should not be None in dygraph mode."
+        attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level',
+                 refer_level, 'refer_scale', refer_scale)
+        multi_rois, restore_ind, rois_num_per_level = core.ops.distribute_fpn_proposals(
+            fpn_rois, rois_num, num_lvl, num_lvl, *attrs)
+        return multi_rois, restore_ind, rois_num_per_level
+
     check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'],
                              'distribute_fpn_proposals')
     helper = LayerHelper('distribute_fpn_proposals', **locals())
     dtype = helper.input_dtype('fpn_rois')
-    num_lvl = max_level - min_level + 1
     multi_rois = [
         helper.create_variable_for_type_inference(dtype) for i in range(num_lvl)
     ]
+
     restore_ind = helper.create_variable_for_type_inference(dtype='int32')
+
+    inputs = {'FpnRois': fpn_rois}
+    outputs = {
+        'MultiFpnRois': multi_rois,
+        'RestoreIndex': restore_ind,
+    }
+
+    if rois_num is not None:
+        inputs['RoisNum'] = rois_num
+        rois_num_per_level = [
+            helper.create_variable_for_type_inference(dtype='int32')
+            for i in range(num_lvl)
+        ]
+        outputs['MultiLevelRoIsNum'] = rois_num_per_level
+
     helper.append_op(
         type='distribute_fpn_proposals',
-        inputs={'FpnRois': fpn_rois},
-        outputs={'MultiFpnRois': multi_rois,
-                 'RestoreIndex': restore_ind},
+        inputs=inputs,
+        outputs=outputs,
         attrs={
             'min_level': min_level,
             'max_level': max_level,
             'refer_level': refer_level,
             'refer_scale': refer_scale
         })
+    if rois_num is not None:
+        return multi_rois, restore_ind, rois_num_per_level
     return multi_rois, restore_ind
 
 
@@ -3820,6 +3871,7 @@ def collect_fpn_proposals(multi_rois,
                           min_level,
                           max_level,
                           post_nms_top_n,
+                          rois_num_per_level=None,
                           name=None):
     """
 	:alias_main: paddle.nn.functional.collect_fpn_proposals
@@ -3846,6 +3898,12 @@ def collect_fpn_proposals(multi_rois,
         min_level(int): The lowest level of FPN layer to collect
         max_level(int): The highest level of FPN layer to collect
         post_nms_top_n(int): The number of selected RoIs
+        rois_num_per_level(list, optional): The List of RoIs' numbers. 
+            Each element is 1-D Tensor which contains the RoIs' number of each 
+            image on each level and the shape is [B] and data type is 
+            int32, B is the number of images. If it is not None then return 
+            a 1-D Tensor contains the output RoIs' number of each image and 
+            the shape is [B]. Default: None
         name(str, optional): For detailed information, please refer 
             to :ref:`api_guide_Name`. Usually name is no need to set and 
             None by default.        
@@ -3856,6 +3914,9 @@ def collect_fpn_proposals(multi_rois,
         fpn_rois(Variable): 2-D LoDTensor with shape [N, 4] and data type is 
         float32 or float64. Selected RoIs. 
 
+        rois_num(Tensor): 1-D Tensor contains the RoIs's number of each 
+        image. The shape is [B] and data type is int32. B is the number of 
+        images. 
 
     Examples:
         .. code-block:: python
@@ -3879,21 +3940,38 @@ def collect_fpn_proposals(multi_rois,
     """
     check_type(multi_rois, 'multi_rois', list, 'collect_fpn_proposals')
     check_type(multi_scores, 'multi_scores', list, 'collect_fpn_proposals')
+    num_lvl = max_level - min_level + 1
+    input_rois = multi_rois[:num_lvl]
+    input_scores = multi_scores[:num_lvl]
+
+    if in_dygraph_mode():
+        assert rois_num_per_level is not None, "rois_num_per_level should not be None in dygraph mode."
+        attrs = ('post_nms_topN', post_nms_top_n)
+        output_rois, rois_num = core.ops.collect_fpn_proposals(
+            input_rois, input_scores, rois_num_per_level, *attrs)
+
     helper = LayerHelper('collect_fpn_proposals', **locals())
     dtype = helper.input_dtype('multi_rois')
     check_dtype(dtype, 'multi_rois', ['float32', 'float64'],
                 'collect_fpn_proposals')
-    num_lvl = max_level - min_level + 1
-    input_rois = multi_rois[:num_lvl]
-    input_scores = multi_scores[:num_lvl]
     output_rois = helper.create_variable_for_type_inference(dtype)
     output_rois.stop_gradient = True
+
+    inputs = {
+        'MultiLevelRois': input_rois,
+        'MultiLevelScores': input_scores,
+    }
+    outputs = {'FpnRois': output_rois}
+    if rois_num_per_level is not None:
+        inputs['MultiLevelRoIsNum'] = rois_num_per_level
+        rois_num = helper.create_variable_for_type_inference(dtype='int32')
+        rois_num.stop_gradient = True
+        outputs['RoisNum'] = rois_num
     helper.append_op(
         type='collect_fpn_proposals',
-        inputs={
-            'MultiLevelRois': input_rois,
-            'MultiLevelScores': input_scores
-        },
-        outputs={'FpnRois': output_rois},
+        inputs=inputs,
+        outputs=outputs,
         attrs={'post_nms_topN': post_nms_top_n})
+    if rois_num_per_level is not None:
+        return output_rois, rois_num
     return output_rois
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 70f48e82fdf6564698098cbdf6e1290f7fba0e18..5a14b9fdc7b6d963f77eefc29476bc332f3938df 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -6862,7 +6862,8 @@ def roi_pool(input,
              pooled_height=1,
              pooled_width=1,
              spatial_scale=1.0,
-             rois_lod=None):
+             rois_num=None,
+             name=None):
     """
     :alias_main: paddle.nn.functional.roi_pool
 	:alias: paddle.nn.functional.roi_pool,paddle.nn.functional.vision.roi_pool
@@ -6882,10 +6883,14 @@ def roi_pool(input,
     Args:
         input (Variable): Input feature, 4D-Tensor with the shape of [N,C,H,W], where N is the batch size, C is the input channel, H is Height, W is weight. The data type is float32 or float64.
         rois (Variable): ROIs (Regions of Interest) to pool over. 2D-LoDTensor with the shape of [num_rois,4], the lod level is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates.
-        rois_lod (Variable): The lod info of rois. Default: None
         pooled_height (int, optional): The pooled output height, data type is int32. Default: 1
         pooled_width (int, optional): The pooled output height, data type is int32. Default: 1
         spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0
+        rois_num (Tensor): The number of RoIs in each image. Default: None
+        name(str, optional): For detailed information, please refer
+            to :ref:`api_guide_Name`. Usually name is no need to set and
+            None by default.
+
 
     Returns:
         Variable: The pooled feature, 4D-Tensor with the shape of [num_rois, C, pooled_height, pooled_width].
@@ -6905,11 +6910,11 @@ def roi_pool(input,
 
         input_data = np.array([i for i in range(1,17)]).reshape(1,1,4,4).astype(DATATYPE)
         roi_data =fluid.create_lod_tensor(np.array([[1., 1., 2., 2.], [1.5, 1.5, 3., 3.]]).astype(DATATYPE),[[2]], place)
-        rois_lod_data = np.array([0, 2])
+        rois_num_data = np.array([2]).astype('int32')
 
         x = fluid.data(name='input', shape=[None,1,4,4], dtype=DATATYPE)
         rois = fluid.data(name='roi', shape=[None,4], dtype=DATATYPE)
-        rois_lod = fluid.data(name='rois_lod', shape=[None], dtype='int64')
+        rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32')
 
         pool_out = fluid.layers.roi_pool(
                 input=x,
@@ -6917,24 +6922,36 @@ def roi_pool(input,
                 pooled_height=1,
                 pooled_width=1,
                 spatial_scale=1.0,
-                rois_lod=rois_lod)
+                rois_num=rois_num)
 
         exe = fluid.Executor(place)
-        out, = exe.run(feed={'input':input_data ,'roi':roi_data, 'rois_lod': rois_lod_data}, fetch_list=[pool_out.name])
+        out, = exe.run(feed={'input':input_data ,'roi':roi_data, 'rois_num': rois_num_data}, fetch_list=[pool_out.name])
         print(out)   #array([[[[11.]]], [[[16.]]]], dtype=float32)
         print(np.array(out).shape)  # (2, 1, 1, 1)
     """
+    if in_dygraph_mode():
+        assert rois_num is not None, "rois_num should not be None in dygraph mode."
+        pool_out, argmaxes = core.ops.roi_pool(
+            input, rois, rois_num, "pooled_height", pooled_height,
+            "pooled_width", pooled_width, "spatial_scale", spatial_scale)
+        return pool_out, argmaxes
+
     check_variable_and_dtype(input, 'input', ['float32'], 'roi_pool')
     check_variable_and_dtype(rois, 'rois', ['float32'], 'roi_pool')
     helper = LayerHelper('roi_pool', **locals())
     dtype = helper.input_dtype()
     pool_out = helper.create_variable_for_type_inference(dtype)
     argmaxes = helper.create_variable_for_type_inference(dtype='int32')
+
+    inputs = {
+        "X": input,
+        "ROIs": rois,
+    }
+    if rois_num is not None:
+        inputs['RoisNum'] = rois_num
     helper.append_op(
         type="roi_pool",
-        inputs={"X": input,
-                "ROIs": rois,
-                "RoisLod": rois_lod},
+        inputs=inputs,
         outputs={"Out": pool_out,
                  "Argmax": argmaxes},
         attrs={
@@ -6952,8 +6969,8 @@ def roi_align(input,
               pooled_width=1,
               spatial_scale=1.0,
               sampling_ratio=-1,
-              name=None,
-              rois_lod=None):
+              rois_num=None,
+              name=None):
     """
     :alias_main: paddle.nn.functional.roi_align
 	:alias: paddle.nn.functional.roi_align,paddle.nn.functional.vision.roi_align
@@ -6968,11 +6985,11 @@ def roi_align(input,
             data type is float32 or float64. Given as [[x1, y1, x2, y2], ...],
             (x1, y1) is the top left coordinates, and (x2, y2) is the bottom
             right coordinates.
-        rois_lod (Variable): The lod info of rois. Default: None
         pooled_height (int32, optional): ${pooled_height_comment} Default: 1
         pooled_width (int32, optional): ${pooled_width_comment} Default: 1
         spatial_scale (float32, optional): ${spatial_scale_comment} Default: 1.0
         sampling_ratio(int32, optional): ${sampling_ratio_comment} Default: -1
+        rois_num (Tensor): The number of RoIs in each image. Default: None
         name(str, optional): For detailed information, please refer
             to :ref:`api_guide_Name`. Usually name is no need to set and
             None by default.
@@ -6991,26 +7008,38 @@ def roi_align(input,
                 name='data', shape=[None, 256, 32, 32], dtype='float32')
             rois = fluid.data(
                 name='rois', shape=[None, 4], dtype='float32')
-            rois_lod = fluid.data(name='rois_lod', shape=[None], dtype='int64')
+            rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32')
             align_out = fluid.layers.roi_align(input=x,
                                                rois=rois,
                                                pooled_height=7,
                                                pooled_width=7,
                                                spatial_scale=0.5,
                                                sampling_ratio=-1,
-                                               rois_lod=rois_lod)
+                                               rois_num=rois_num)
     """
+    if in_dygraph_mode():
+        assert rois_num is not None, "rois_num should not be None in dygraph mode."
+        align_out = core.ops.roi_align(
+            input, rois, rois_num, "pooled_height", pooled_height,
+            "pooled_width", pooled_width, "spatial_scale", spatial_scale,
+            "sampling_ratio", sampling_ratio)
+        return align_out
+
     check_variable_and_dtype(input, 'input', ['float32', 'float64'],
                              'roi_align')
     check_variable_and_dtype(rois, 'rois', ['float32', 'float64'], 'roi_align')
     helper = LayerHelper('roi_align', **locals())
     dtype = helper.input_dtype()
     align_out = helper.create_variable_for_type_inference(dtype)
+    inputs = {
+        "X": input,
+        "ROIs": rois,
+    }
+    if rois_num is not None:
+        inputs['RoisNum'] = rois_num
     helper.append_op(
         type="roi_align",
-        inputs={"X": input,
-                "ROIs": rois,
-                "RoisLod": rois_lod},
+        inputs=inputs,
         outputs={"Out": align_out},
         attrs={
             "pooled_height": pooled_height,
@@ -10850,8 +10879,7 @@ def slice(input, axes, starts, ends):
                 result = [ [2, 3, 4], ] # result = data[0:1, 1:4]
     Args:
         input (Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``.
-        axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to.
-                            It's optional. If it is not provides, it will be treated as :math:`[0,1,...,len(starts)-1]`.
+        axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to .
         starts (list|tuple|Variable): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of
                 it should be integers or Tensors with shape [1]. If ``starts`` is an Variable, it should be an 1-D Tensor.
                 It represents starting indices of corresponding axis in ``axes``.
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index a90551c1b7b4fd45ae9a0e1cfa225a87db811295..89acfc6075be0b625da04d187cd46dd47ac699c9 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -36,7 +36,7 @@ __all__ = [
     'tensor_array_to_tensor', 'concat', 'sums', 'assign',
     'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
     'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite',
-    'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye'
+    'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye', 'triu'
 ]
 
 
@@ -1725,3 +1725,9 @@ def ones_like(x, out=None):
         attrs={'value': 1.0},
         outputs={'Out': [out]})
     return out
+
+
+@deprecated(since="2.0.0", update_to="paddle.triu")
+def triu(input, diagonal=0, name=None):
+    import paddle
+    return paddle.tensor.triu(x=input, diagonal=diagonal, name=name)
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index a1526934f4aa1415c529e79bfa8dea6c0754bea9..425c4e3c7e38cff2f892eff28428082b57b3727d 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -19,6 +19,57 @@ import paddle.fluid.layers as layers
 from paddle.fluid.layers import detection
 from paddle.fluid.framework import Program, program_guard
 import unittest
+import contextlib
+import numpy as np
+from unittests.test_imperative_base import new_program_scope
+from paddle.fluid.dygraph import base
+from paddle.fluid import core
+
+
+class LayerTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.seed = 111
+
+    @classmethod
+    def tearDownClass(cls):
+        pass
+
+    def _get_place(self, force_to_use_cpu=False):
+        # this option for ops that only have cpu kernel
+        if force_to_use_cpu:
+            return core.CPUPlace()
+        else:
+            if core.is_compiled_with_cuda():
+                return core.CUDAPlace(0)
+            return core.CPUPlace()
+
+    @contextlib.contextmanager
+    def static_graph(self):
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = self.seed
+            fluid.default_main_program().random_seed = self.seed
+            yield
+
+    def get_static_graph_result(self,
+                                feed,
+                                fetch_list,
+                                with_lod=False,
+                                force_to_use_cpu=False):
+        exe = fluid.Executor(self._get_place(force_to_use_cpu))
+        exe.run(fluid.default_startup_program())
+        return exe.run(fluid.default_main_program(),
+                       feed=feed,
+                       fetch_list=fetch_list,
+                       return_numpy=(not with_lod))
+
+    @contextlib.contextmanager
+    def dynamic_graph(self, force_to_use_cpu=False):
+        with fluid.dygraph.guard(
+                self._get_place(force_to_use_cpu=force_to_use_cpu)):
+            fluid.default_startup_program().random_seed = self.seed
+            fluid.default_main_program().random_seed = self.seed
+            yield
 
 
 class TestDetection(unittest.TestCase):
@@ -481,45 +532,67 @@ class TestRpnTargetAssign(unittest.TestCase):
             print(str(program))
 
 
-class TestGenerateProposals(unittest.TestCase):
+class TestGenerateProposals(LayerTest):
     def test_generate_proposals(self):
-        program = Program()
-        with program_guard(program):
-            data_shape = [20, 64, 64]
-            images = fluid.layers.data(
-                name='images', shape=data_shape, dtype='float32')
-            im_info = fluid.layers.data(
-                name='im_info', shape=[3], dtype='float32')
-            anchors, variances = fluid.layers.anchor_generator(
-                name='anchor_generator',
-                input=images,
-                anchor_sizes=[32, 64],
-                aspect_ratios=[1.0],
-                variance=[0.1, 0.1, 0.2, 0.2],
-                stride=[16.0, 16.0],
-                offset=0.5)
-            num_anchors = anchors.shape[2]
-            scores = fluid.layers.data(
-                name='scores', shape=[num_anchors, 8, 8], dtype='float32')
-            bbox_deltas = fluid.layers.data(
-                name='bbox_deltas',
-                shape=[num_anchors * 4, 8, 8],
-                dtype='float32')
-            rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals(
-                name='generate_proposals',
-                scores=scores,
-                bbox_deltas=bbox_deltas,
-                im_info=im_info,
-                anchors=anchors,
-                variances=variances,
-                pre_nms_top_n=6000,
-                post_nms_top_n=1000,
-                nms_thresh=0.5,
-                min_size=0.1,
-                eta=1.0)
-            self.assertIsNotNone(rpn_rois)
-            self.assertIsNotNone(rpn_roi_probs)
-            print(rpn_rois.shape)
+        scores_np = np.random.rand(2, 3, 4, 4).astype('float32')
+        bbox_deltas_np = np.random.rand(2, 12, 4, 4).astype('float32')
+        im_info_np = np.array([[8, 8, 0.5], [6, 6, 0.5]]).astype('float32')
+        anchors_np = np.reshape(np.arange(4 * 4 * 3 * 4),
+                                [4, 4, 3, 4]).astype('float32')
+        variances_np = np.ones((4, 4, 3, 4)).astype('float32')
+
+        with self.static_graph():
+            scores = fluid.data(
+                name='scores', shape=[2, 3, 4, 4], dtype='float32')
+            bbox_deltas = fluid.data(
+                name='bbox_deltas', shape=[2, 12, 4, 4], dtype='float32')
+            im_info = fluid.data(name='im_info', shape=[2, 3], dtype='float32')
+            anchors = fluid.data(
+                name='anchors', shape=[4, 4, 3, 4], dtype='float32')
+            variances = fluid.data(
+                name='var', shape=[4, 4, 3, 4], dtype='float32')
+            rois, roi_probs, rois_num = fluid.layers.generate_proposals(
+                scores,
+                bbox_deltas,
+                im_info,
+                anchors,
+                variances,
+                pre_nms_top_n=10,
+                post_nms_top_n=5,
+                return_rois_num=True)
+            rois_stat, roi_probs_stat, rois_num_stat = self.get_static_graph_result(
+                feed={
+                    'scores': scores_np,
+                    'bbox_deltas': bbox_deltas_np,
+                    'im_info': im_info_np,
+                    'anchors': anchors_np,
+                    'var': variances_np
+                },
+                fetch_list=[rois, roi_probs, rois_num],
+                with_lod=True)
+
+        with self.dynamic_graph():
+            scores_dy = base.to_variable(scores_np)
+            bbox_deltas_dy = base.to_variable(bbox_deltas_np)
+            im_info_dy = base.to_variable(im_info_np)
+            anchors_dy = base.to_variable(anchors_np)
+            variances_dy = base.to_variable(variances_np)
+            rois, roi_probs, rois_num = fluid.layers.generate_proposals(
+                scores_dy,
+                bbox_deltas_dy,
+                im_info_dy,
+                anchors_dy,
+                variances_dy,
+                pre_nms_top_n=10,
+                post_nms_top_n=5,
+                return_rois_num=True)
+            rois_dy = rois.numpy()
+            roi_probs_dy = roi_probs.numpy()
+            rois_num_dy = rois_num.numpy()
+
+        self.assertTrue(np.array_equal(np.array(rois_stat), rois_dy))
+        self.assertTrue(np.array_equal(np.array(roi_probs_stat), roi_probs_dy))
+        self.assertTrue(np.array_equal(np.array(rois_num_stat), rois_num_dy))
 
 
 class TestYoloDetection(unittest.TestCase):
@@ -648,30 +721,81 @@ class TestMulticlassNMS2(unittest.TestCase):
             self.assertIsNotNone(index)
 
 
-class TestCollectFpnPropsals(unittest.TestCase):
+class TestCollectFpnPropsals(LayerTest):
     def test_collect_fpn_proposals(self):
-        program = Program()
-        with program_guard(program):
+        multi_bboxes_np = []
+        multi_scores_np = []
+        rois_num_per_level_np = []
+        for i in range(4):
+            bboxes_np = np.random.rand(5, 4).astype('float32')
+            scores_np = np.random.rand(5, 1).astype('float32')
+            rois_num = np.array([2, 3]).astype('int32')
+            multi_bboxes_np.append(bboxes_np)
+            multi_scores_np.append(scores_np)
+            rois_num_per_level_np.append(rois_num)
+
+        with self.static_graph():
             multi_bboxes = []
             multi_scores = []
+            rois_num_per_level = []
             for i in range(4):
-                bboxes = layers.data(
+                bboxes = fluid.data(
                     name='rois' + str(i),
-                    shape=[10, 4],
+                    shape=[5, 4],
                     dtype='float32',
-                    lod_level=1,
-                    append_batch_size=False)
-                scores = layers.data(
+                    lod_level=1)
+                scores = fluid.data(
                     name='scores' + str(i),
-                    shape=[10, 1],
+                    shape=[5, 1],
                     dtype='float32',
-                    lod_level=1,
-                    append_batch_size=False)
+                    lod_level=1)
+                rois_num = fluid.data(
+                    name='rois_num' + str(i), shape=[None], dtype='int32')
+
                 multi_bboxes.append(bboxes)
                 multi_scores.append(scores)
-            fpn_rois = layers.collect_fpn_proposals(multi_bboxes, multi_scores,
-                                                    2, 5, 10)
-            self.assertIsNotNone(fpn_rois)
+                rois_num_per_level.append(rois_num)
+
+            fpn_rois, rois_num = layers.collect_fpn_proposals(
+                multi_bboxes,
+                multi_scores,
+                2,
+                5,
+                10,
+                rois_num_per_level=rois_num_per_level)
+            feed = {}
+            for i in range(4):
+                feed['rois' + str(i)] = multi_bboxes_np[i]
+                feed['scores' + str(i)] = multi_scores_np[i]
+                feed['rois_num' + str(i)] = rois_num_per_level_np[i]
+            fpn_rois_stat, rois_num_stat = self.get_static_graph_result(
+                feed=feed, fetch_list=[fpn_rois, rois_num], with_lod=True)
+            fpn_rois_stat = np.array(fpn_rois_stat)
+            rois_num_stat = np.array(rois_num_stat)
+
+        with self.dynamic_graph():
+            multi_bboxes_dy = []
+            multi_scores_dy = []
+            rois_num_per_level_dy = []
+            for i in range(4):
+                bboxes_dy = base.to_variable(multi_bboxes_np[i])
+                scores_dy = base.to_variable(multi_scores_np[i])
+                rois_num_dy = base.to_variable(rois_num_per_level_np[i])
+                multi_bboxes_dy.append(bboxes_dy)
+                multi_scores_dy.append(scores_dy)
+                rois_num_per_level_dy.append(rois_num_dy)
+            fpn_rois_dy, rois_num_dy = fluid.layers.collect_fpn_proposals(
+                multi_bboxes_dy,
+                multi_scores_dy,
+                2,
+                5,
+                10,
+                rois_num_per_level=rois_num_per_level_dy)
+            fpn_rois_dy = fpn_rois_dy.numpy()
+            rois_num_dy = rois_num_dy.numpy()
+
+        self.assertTrue(np.array_equal(fpn_rois_stat, fpn_rois_dy))
+        self.assertTrue(np.array_equal(rois_num_stat, rois_num_dy))
 
     def test_collect_fpn_proposals_error(self):
         def generate_input(bbox_type, score_type, name):
@@ -717,20 +841,51 @@ class TestCollectFpnPropsals(unittest.TestCase):
                 post_nms_top_n=2000)
 
 
-class TestDistributeFpnProposals(unittest.TestCase):
+class TestDistributeFpnProposals(LayerTest):
     def test_distribute_fpn_proposals(self):
-        program = Program()
-        with program_guard(program):
-            fpn_rois = fluid.layers.data(
-                name='data', shape=[4], dtype='float32', lod_level=1)
-            multi_rois, restore_ind = layers.distribute_fpn_proposals(
-                fpn_rois=fpn_rois,
+        rois_np = np.random.rand(10, 4).astype('float32')
+        rois_num_np = np.array([4, 6]).astype('int32')
+        with self.static_graph():
+            rois = fluid.data(name='rois', shape=[10, 4], dtype='float32')
+            rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32')
+            multi_rois, restore_ind, rois_num_per_level = layers.distribute_fpn_proposals(
+                fpn_rois=rois,
                 min_level=2,
                 max_level=5,
                 refer_level=4,
-                refer_scale=224)
-            self.assertIsNotNone(multi_rois)
-            self.assertIsNotNone(restore_ind)
+                refer_scale=224,
+                rois_num=rois_num)
+            fetch_list = multi_rois + [restore_ind] + rois_num_per_level
+            output_stat = self.get_static_graph_result(
+                feed={'rois': rois_np,
+                      'rois_num': rois_num_np},
+                fetch_list=fetch_list,
+                with_lod=True)
+            output_stat_np = []
+            for output in output_stat:
+                output_np = np.array(output)
+                if len(output_np) > 0:
+                    output_stat_np.append(output_np)
+
+        with self.dynamic_graph():
+            rois_dy = base.to_variable(rois_np)
+            rois_num_dy = base.to_variable(rois_num_np)
+            multi_rois_dy, restore_ind_dy, rois_num_per_level_dy = layers.distribute_fpn_proposals(
+                fpn_rois=rois_dy,
+                min_level=2,
+                max_level=5,
+                refer_level=4,
+                refer_scale=224,
+                rois_num=rois_num_dy)
+            output_dy = multi_rois_dy + [restore_ind_dy] + rois_num_per_level_dy
+            output_dy_np = []
+            for output in output_dy:
+                output_np = output.numpy()
+                if len(output_np) > 0:
+                    output_dy_np.append(output_np)
+
+        for res_stat, res_dy in zip(output_stat_np, output_dy_np):
+            self.assertTrue(np.array_equal(res_stat, res_dy))
 
     def test_distribute_fpn_proposals_error(self):
         program = Program()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 935813251930b8093dd0c8a9f11ac30772133d20..db472ec01662c2a93e6d7f4b2c2a577544b6a90a 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -440,6 +440,8 @@ if(WITH_DISTRIBUTE)
     # FIXME(seiriosX) will fix this
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_sparse_embedding_ctr")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_gloo")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_a_sync_optimizer_auto")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
 
     py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS})
     py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS})
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
index 0b8df63d666b6547d5dccfc2ce0b420d653cc544..eed02ea655ed406fe59d8117e4fd836dfe1bfd4e 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py
@@ -47,8 +47,8 @@ class SimpleNet(Layer):
         return z
 
     @declarative(input_spec=[[InputSpec([None, 10]), InputSpec([None, 10])]])
-    def func_with_list(self, l):
-        x, y, int_val = l
+    def func_with_list(self, l, int_val=1):
+        x, y = l
         z = x + y
         z = z + int_val
         return z
@@ -60,10 +60,7 @@ class SimpleNet(Layer):
     def func_with_dict(self, d):
         x = d['x']
         y = d['y']
-        int_val = d['int_val']
-
         z = x + y
-        z = z + int_val
 
         return z
 
@@ -131,10 +128,10 @@ class TestInputSpec(unittest.TestCase):
             self.assertTrue(len(net.add_func.program_cache) == 1)
 
             # 5. test input with list
-            out = net.func_with_list([x, y, int_val])
+            out = net.func_with_list([x, y], int_val)
 
             # 6. test input with dict
-            out = net.func_with_dict({'x': x, 'y': y, 'int_val': int_val})
+            out = net.func_with_dict({'x': x, 'y': y})
 
             # 7. test input with lits contains dict
             int_np = np.ones([1]).astype('float32')
@@ -293,6 +290,30 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase):
                 foo_3.concrete_program
 
 
+class TestInputDefaultName(unittest.TestCase):
+    def setUp(self):
+        paddle.disable_static()
+        self.net = SimpleNet()
+
+    def assert_default_name(self, func_name, input_names):
+        decorated_func = getattr(self.net, func_name)
+
+        spec_names = [x.name for x in decorated_func.inputs]
+        self.assertListEqual(spec_names, input_names)
+
+    def test_common_input(self):
+        self.assert_default_name('forward', ['x'])
+
+    def test_list_input(self):
+        self.assert_default_name('func_with_list', ['l_0', 'l_1'])
+
+    def test_dict_input(self):
+        self.assert_default_name('func_with_dict', ['x', 'y'])
+
+    def test_nest_input(self):
+        self.assert_default_name('func_with_list_dict', ['dl_0', 'x', 'y'])
+
+
 class TestDeclarativeAPI(unittest.TestCase):
     def test_error(self):
         func = declarative(dyfunc_to_variable)
diff --git a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py
index 034bb7f8dc7e00a321b6c6a5a4776fa4f7398ab5..a2f56c428012c615dcf55b6832a54ca473771d08 100644
--- a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py
@@ -33,10 +33,14 @@ class TestCollectFPNProposalstOp(OpTest):
                     for i in range(self.num_level)]
         self.inputs = {
             'MultiLevelRois': inputs_x,
-            "MultiLevelScores": self.scores_input
+            "MultiLevelScores": self.scores_input,
+            'MultiLevelRoIsNum': []
         }
         self.attrs = {'post_nms_topN': self.post_nms_top_n, }
-        self.outputs = {'FpnRois': (self.rois, [self.lod])}
+        self.outputs = {
+            'FpnRois': (self.rois, [self.lod]),
+            'RoisNum': np.array(self.lod).astype('int32')
+        }
 
     def init_test_case(self):
         self.post_nms_top_n = 20
@@ -96,5 +100,32 @@ class TestCollectFPNProposalstOp(OpTest):
         self.check_output(check_dygraph=False)
 
 
+class TestCollectFPNProposalstOpWithRoisNum(TestCollectFPNProposalstOp):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.scores_input = [('y%d' % i,
+                              (self.scores[i].reshape(-1, 1), self.rois_lod[i]))
+                             for i in range(self.num_level)]
+        self.rois, self.lod = self.calc_rois_collect()
+        inputs_x = [('x%d' % i, (self.roi_inputs[i][:, 1:], self.rois_lod[i]))
+                    for i in range(self.num_level)]
+        rois_num_per_level = [
+            ('rois%d' % i, np.array(self.rois_lod[i][0]).astype('int32'))
+            for i in range(self.num_level)
+        ]
+
+        self.inputs = {
+            'MultiLevelRois': inputs_x,
+            "MultiLevelScores": self.scores_input,
+            'MultiLevelRoIsNum': rois_num_per_level
+        }
+        self.attrs = {'post_nms_topN': self.post_nms_top_n, }
+        self.outputs = {
+            'FpnRois': (self.rois, [self.lod]),
+            'RoisNum': np.array(self.lod).astype('int32')
+        }
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab47659a88de44f60291ef056c0fd5ed2e01b5f2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+import paddle.distributed.fleet.base.role_maker as role_maker
+import time
+
+
+class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_a_sync_optimizer1(self):
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.auto = True
+        optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        self.assertTrue(optimizer.user_defined_strategy.a_sync)
+        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
+        self.assertTrue(a_sync_configs['k_steps'] == 0)
+
+    def test_a_sync_optimizer2(self):
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.auto = True
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        self.assertTrue(optimizer.user_defined_strategy.a_sync)
+        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
+        self.assertTrue(a_sync_configs['k_steps'] == 800)
+
+    def test_a_sync_optimizer3(self):
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        x_embedding = paddle.fluid.layers.embedding(
+            is_distributed=False,
+            input=input_x,
+            size=[1000000000, 100000],
+            param_attr=paddle.fluid.ParamAttr(
+                name="embedding",
+                initializer=paddle.fluid.initializer.Constant(value=0.01)),
+            is_sparse=True)
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.auto = True
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        self.assertTrue(optimizer.user_defined_strategy.a_sync)
+        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
+        self.assertTrue(a_sync_configs['k_steps'] == 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
index beb0069eb770f25d7834749ff9c188e5252e13c0..3a923dbf3f72e28c64c3f01d22d4d6f2d897f79b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -76,9 +76,10 @@ class FleetDistRunnerBase(object):
         return role
 
     def build_strategy(self, args):
-        self.strategy = paddle.distributed.fleet.DistributedStrategy()
-        self.strategy.a_sync = False
-        if args.mode == "async":
+        if args.mode == "sync":
+            self.strategy = paddle.distributed.fleet.DistributedStrategy()
+            self.strategy.a_sync = False
+        elif args.mode == "async":
             self.strategy = paddle.distributed.fleet.DistributedStrategy()
             self.strategy.a_sync = True
         elif args.mode == "geo":
@@ -87,6 +88,10 @@ class FleetDistRunnerBase(object):
             self.strategy.a_sync_configs = {
                 "k_steps": args.geo_sgd_need_push_nums
             }
+        elif args.mode == "auto":
+            self.strategy = paddle.distributed.fleet.DistributedStrategy()
+            self.strategy.auto = True
+
         self.dump_param = os.getenv("dump_param", "").split(",")
         self.dump_fields = os.getenv("dump_fields", "").split(",")
         self.dump_fields_path = os.getenv("dump_fields_path", "")
@@ -232,14 +237,17 @@ class TestFleetBase(unittest.TestCase):
         tr0_pipe = open(tempfile.gettempdir() + "/tr0_err.log", "wb+")
         tr1_pipe = open(tempfile.gettempdir() + "/tr1_err.log", "wb+")
 
+        tr0_out = open(tempfile.gettempdir() + "/tr0_stdout.log", "wb+")
+        tr1_out = open(tempfile.gettempdir() + "/tr1_stdout.log", "wb+")
+
         tr0_proc = subprocess.Popen(
             tr0_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
+            stdout=tr0_out,
             stderr=tr0_pipe,
             env=required_envs)
         tr1_proc = subprocess.Popen(
             tr1_cmd.strip().split(" "),
-            stdout=subprocess.PIPE,
+            stdout=tr1_out,
             stderr=tr1_pipe,
             env=required_envs)
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
index e2336caac1c07f555280b82ba8fcfa7e5ec7f5b8..02ca0588e7452d44817f6c288ea9cf77b80dbfe8 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
@@ -52,6 +52,38 @@ class TestDistMnistSync2x2(TestFleetBase):
             "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
 
 
+class TestDistMnistAuto2x2(TestFleetBase):
+    def _setup_config(self):
+        self._mode = "auto"
+        self._reader = "pyreader"
+
+    def check_with_place(self,
+                         model_file,
+                         delta=1e-3,
+                         check_error_log=False,
+                         need_envs={}):
+        required_envs = {
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
+            "FLAGS_rpc_deadline": "5000",  # 5sec to fail fast
+            "http_proxy": "",
+            "CPU_NUM": "2"
+        }
+
+        required_envs.update(need_envs)
+
+        if check_error_log:
+            required_envs["GLOG_v"] = "3"
+            required_envs["GLOG_logtostderr"] = "1"
+
+        tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)
+
+    def test_dist_train(self):
+        self.check_with_place(
+            "dist_fleet_ctr.py", delta=1e-5, check_error_log=True)
+
+
 class TestDistMnistAsync2x2(TestFleetBase):
     def _setup_config(self):
         self._mode = "async"
diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
index 55b21f1a722f822f1bfcb7bbbda645109092b8a3..ec0125b28ed1b870025adbfd2bd4ba78244bcc11 100644
--- a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py
@@ -35,9 +35,10 @@ class TestDistributeFPNProposalsOp(OpTest):
         }
         output = [('out%d' % i, self.rois_fpn[i])
                   for i in range(len(self.rois_fpn))]
+
         self.outputs = {
             'MultiFpnRois': output,
-            'RestoreIndex': self.rois_idx_restore.reshape(-1, 1)
+            'RestoreIndex': self.rois_idx_restore.reshape(-1, 1),
         }
 
     def init_test_case(self):
@@ -117,5 +118,34 @@ class TestDistributeFPNProposalsOp(OpTest):
         self.check_output()
 
 
+class TestDistributeFPNProposalsOpWithRoisNum(TestDistributeFPNProposalsOp):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.rois_fpn, self.rois_idx_restore = self.calc_rois_distribute()
+        self.inputs = {
+            'FpnRois': (self.rois[:, 1:5], self.rois_lod),
+            'RoisNum': np.array(self.rois_lod[0]).astype('int32')
+        }
+        self.attrs = {
+            'max_level': self.roi_max_level,
+            'min_level': self.roi_min_level,
+            'refer_scale': self.canonical_scale,
+            'refer_level': self.canonical_level
+        }
+        output = [('out%d' % i, self.rois_fpn[i])
+                  for i in range(len(self.rois_fpn))]
+        rois_num_per_level = [
+            ('rois_num%d' % i, np.array(self.rois_fpn[i][1][0]).astype('int32'))
+            for i in range(len(self.rois_fpn))
+        ]
+
+        self.outputs = {
+            'MultiFpnRois': output,
+            'RestoreIndex': self.rois_idx_restore.reshape(-1, 1),
+            'MultiLevelRoIsNum': rois_num_per_level
+        }
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
index ce561cd317c48228c4877a2b65b67fe049a0d84a..26fc01ca04506758599ac5d6fe6842984a8d7a9c 100644
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
@@ -34,18 +34,18 @@ def generate_proposals_in_python(scores, bbox_deltas, im_info, anchors,
 
     rpn_rois = []
     rpn_roi_probs = []
-    lod = []
+    rois_num = []
     num_images = scores.shape[0]
     for img_idx in range(num_images):
         img_i_boxes, img_i_probs = proposal_for_one_image(
             im_info[img_idx, :], all_anchors, variances,
             bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :],
             pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta)
-        lod.append(img_i_probs.shape[0])
+        rois_num.append(img_i_probs.shape[0])
         rpn_rois.append(img_i_boxes)
         rpn_roi_probs.append(img_i_probs)
 
-    return rpn_rois, rpn_roi_probs, lod
+    return rpn_rois, rpn_roi_probs, rois_num
 
 
 def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores,
@@ -87,6 +87,10 @@ def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores,
     proposals = clip_tiled_boxes(proposals, im_info[:2])
     # remove predicted boxes with height or width < min_size
     keep = filter_boxes(proposals, min_size, im_info)
+    if len(keep) == 0:
+        proposals = np.zeros((1, 4)).astype('float32')
+        scores = np.zeros((1, 1)).astype('float32')
+        return proposals, scores
     proposals = proposals[keep, :]
     scores = scores[keep, :]
 
@@ -280,8 +284,8 @@ class TestGenerateProposalsOp(OpTest):
         }
 
         self.outputs = {
-            'RpnRois': (self.rpn_rois[0], [self.lod]),
-            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]),
+            'RpnRois': (self.rpn_rois[0], [self.rois_num]),
+            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]),
         }
 
     def test_check_output(self):
@@ -320,7 +324,7 @@ class TestGenerateProposalsOp(OpTest):
             (batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32')
 
     def init_test_output(self):
-        self.rpn_rois, self.rpn_roi_probs, self.lod = generate_proposals_in_python(
+        self.rpn_rois, self.rpn_roi_probs, self.rois_num = generate_proposals_in_python(
             self.scores, self.bbox_deltas, self.im_info, self.anchors,
             self.variances, self.pre_nms_topN, self.post_nms_topN,
             self.nms_thresh, self.min_size, self.eta)
@@ -349,12 +353,21 @@ class TestGenerateProposalsOutLodOp(TestGenerateProposalsOp):
         }
 
         self.outputs = {
-            'RpnRois': (self.rpn_rois[0], [self.lod]),
-            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]),
-            'RpnRoisLod': (np.asarray(
-                self.lod, dtype=np.int32))
+            'RpnRois': (self.rpn_rois[0], [self.rois_num]),
+            'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]),
+            'RpnRoisNum': (np.asarray(
+                self.rois_num, dtype=np.int32))
         }
 
 
+class TestGenerateProposalsOpNoBoxLeft(TestGenerateProposalsOp):
+    def init_test_params(self):
+        self.pre_nms_topN = 12000  # train 12000, test 2000
+        self.post_nms_topN = 5000  # train 6000, test 1000
+        self.nms_thresh = 0.7
+        self.min_size = 1000.0
+        self.eta = 1.
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
index 4a33f32a0b6977716d8065419f8e0f88d6c4f44a..ea94a8ba69a784efb1a2a12f6f251316553cab50 100644
--- a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py
@@ -100,7 +100,7 @@ def add_cases(suite):
         GridSampleTestCase(
             methodName='runTest',
             mode='bilinear',
-            padding_mode='reflect',
+            padding_mode='reflection',
             align_corners=True))
     suite.addTest(
         GridSampleTestCase(
diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
index 4d1ed5aeb96ebbe064e35c1bee9d5775812440f7..bf2f9518fb0c720556b7eecdf5b286dea0fff96c 100644
--- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
+++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py
@@ -73,7 +73,7 @@ def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode):
 
     if padding_mode == "border":
         grid_slice = clip(grid_slice, 0, max_val)
-    elif padding_mode == "reflect":
+    elif padding_mode == "reflection":
         double_range = 2 * max_val if align_corners else (max_val + 1) * 2
         grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice +
                                                                    0.5)
@@ -211,7 +211,7 @@ class Case2(TestGridSamplerOp):
         self.grid_shape = (2, 8, 9, 2)
         self.theta_shape = (2, 2, 3)
         self.align_corners = False
-        self.padding_mode = "reflect"
+        self.padding_mode = "reflection"
         self.mode = "bilinear"
 
 
@@ -221,7 +221,7 @@ class Case3(TestGridSamplerOp):
         self.grid_shape = (2, 8, 9, 2)
         self.theta_shape = (2, 2, 3)
         self.align_corners = True
-        self.padding_mode = "reflect"
+        self.padding_mode = "reflection"
         self.mode = "bilinear"
 
 
@@ -231,7 +231,7 @@ class Case4(TestGridSamplerOp):
         self.grid_shape = (2, 8, 9, 2)
         self.theta_shape = (2, 2, 3)
         self.align_corners = False
-        self.padding_mode = "reflect"
+        self.padding_mode = "reflection"
         self.mode = "nearest"
         self.numeric_grad_delta = 0.0001
 
diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
index 8780727e4cb276a989a8d04d05c6419a4874e7f5..041fe4e9043d60852fcaab42bc233b63b39609ce 100644
--- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py
@@ -24,7 +24,10 @@ def kldiv_loss(x, target, reduction):
     loss = np.where(target >= 0, output, np.zeros_like(x))
 
     if reduction == "batchmean":
-        return loss.sum() / x.shape[0]
+        if len(x.shape) > 0:
+            return loss.sum() / x.shape[0]
+        else:
+            return loss.sum()
     if reduction == "mean":
         return loss.mean()
     if reduction == "sum":
@@ -93,6 +96,9 @@ class TestKLDivLossDygraph(unittest.TestCase):
     def test_kl_loss_batchmean(self):
         self.run_kl_loss('batchmean')
 
+    def test_kl_loss_batchmean_shape(self):
+        self.run_kl_loss('batchmean', ())
+
     def test_kl_loss_mean(self):
         self.run_kl_loss('mean')
 
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index b76887f0965ca64b2b40bf9c0ce6e82b44fdad2f..89e9f7aad8581228411b1983580ced5566e65765 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -3318,15 +3318,29 @@ class TestBook(LayerTest):
             return (out)
 
     def test_roi_pool(self):
-        # TODO(minqiyang): dygraph do not support lod now
+        x_np = np.random.rand(2, 3, 8, 8).astype('float32')
+        rois_np = np.random.rand(3, 4).astype('float32')
+        rois_num_np = np.array([1, 2]).astype('int32')
+
         with self.static_graph():
-            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
-            rois = layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
-            rois_lod = layers.data(
-                name="rois_lod", shape=[None, ], dtype="int", lod_level=1)
-            output = layers.roi_pool(x, rois, 7, 7, 0.6, rois_lod)
-            return (output)
+            x = layers.data(name="x", shape=[3, 8, 8], dtype="float32")
+            rois = layers.data(name="rois", shape=[4], dtype="float32")
+            rois_num = fluid.data(name="rois_num", shape=[None], dtype="int32")
+            output = layers.roi_pool(x, rois, 4, 4, 0.5, rois_num=rois_num)
+            static_res = self.get_static_graph_result(
+                feed={'x': x_np,
+                      'rois': rois_np,
+                      'rois_num': rois_num_np},
+                fetch_list=[output])[0]
+
+        with self.dynamic_graph():
+            x_dy = base.to_variable(x_np)
+            rois_dy = base.to_variable(rois_np)
+            rois_num_dy = base.to_variable(rois_num_np)
+            dy_res = layers.roi_pool(
+                x_dy, rois_dy, 4, 4, 0.5, rois_num=rois_num_dy)
+            dy_res_value = dy_res[0].numpy()
+        self.assertTrue(np.array_equal(static_res, dy_res_value))
 
     def test_sequence_enumerate(self):
         # TODO(minqiyang): dygraph do not support lod now
@@ -3335,16 +3349,29 @@ class TestBook(LayerTest):
             out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0)
 
     def test_roi_align(self):
-        # TODO(minqiyang): dygraph do not support lod now
+        x_np = np.random.rand(2, 3, 8, 8).astype('float32')
+        rois_np = np.random.rand(3, 4).astype('float32')
+        rois_num_np = np.array([1, 2]).astype('int32')
+
         with self.static_graph():
-            x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
-            rois = layers.data(
-                name="rois", shape=[4], dtype="float32", lod_level=1)
-            rois_lod = layers.data(
-                name="rois_lod", shape=[None, ], dtype="int", lod_level=1)
-            output = layers.roi_align(x, rois, 14, 14, 0.5, 2, 'roi_align',
-                                      rois_lod)
-            return (output)
+            x = layers.data(name="x", shape=[3, 8, 8], dtype="float32")
+            rois = layers.data(name="rois", shape=[4], dtype="float32")
+            rois_num = fluid.data(name="rois_num", shape=[None], dtype="int32")
+            output = layers.roi_align(x, rois, 4, 4, 0.5, 2, rois_num=rois_num)
+            static_res = self.get_static_graph_result(
+                feed={'x': x_np,
+                      'rois': rois_np,
+                      'rois_num': rois_num_np},
+                fetch_list=[output])[0]
+
+        with self.dynamic_graph():
+            x_dy = base.to_variable(x_np)
+            rois_dy = base.to_variable(rois_np)
+            rois_num_dy = base.to_variable(rois_num_np)
+            dy_res = layers.roi_align(
+                x_dy, rois_dy, 4, 4, 0.5, 2, rois_num=rois_num_dy)
+            dy_res_value = dy_res.numpy()
+        self.assertTrue(np.array_equal(static_res, dy_res_value))
 
     def test_roi_perspective_transform(self):
         # TODO(minqiyang): dygraph do not support lod now
diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py
index e0edf9019356f38eb3c74b9cadfa6ae575e9b823..43a0d481b28fdc47dec52fe9763dd920fd5a76a2 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py
@@ -16,20 +16,49 @@ from __future__ import print_function
 
 import unittest
 
+import paddle
+import paddle.nn as nn
+import numpy as np
+
+paddle.disable_static()
+
 
 class EmbeddingDygraph(unittest.TestCase):
     def test_1(self):
-        import paddle
-        import paddle.nn as nn
-        import numpy as np
-        paddle.disable_static()
+        x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64)
+        y_data = np.arange(6, 12).reshape((3, 2)).astype(np.float32)
+        paddle.disable_static(paddle.CPUPlace())
+        x = paddle.to_tensor(x_data, stop_gradient=False)
+        y = paddle.to_tensor(y_data, stop_gradient=False)
+
+        embedding = paddle.nn.Embedding(10, 3, sparse=True)
+
+        w0 = np.full(shape=(10, 3), fill_value=2).astype(np.float32)
+        embedding.weight.set_value(w0)
+
+        adam = paddle.optimizer.Adam(
+            parameters=[embedding.weight], learning_rate=0.01)
+        adam.clear_grad()
+
+        out = embedding(x)
+        out.backward()
+        adam.step()
+
+    def test_2(self):
+        x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64)
+        y_data = np.arange(6, 12).reshape((3, 2)).astype(np.float32)
+        paddle.disable_static(paddle.CPUPlace())
+        x = paddle.to_tensor(x_data, stop_gradient=False)
+        y = paddle.to_tensor(y_data, stop_gradient=False)
+
+        with self.assertRaises(ValueError):
+            embedding = paddle.nn.Embedding(10, 3, padding_idx=11, sparse=True)
 
-        # example 1
-        inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64')
-        inp_word.shape  # [2, 3]
-        dict_size = 20
+        with self.assertRaises(ValueError):
+            embedding = paddle.nn.Embedding(-1, 3, sparse=True)
 
-        emb = nn.Embedding(dict_size, 32, weight_attr='emb.w', sparse=False)
+        with self.assertRaises(ValueError):
+            embedding = paddle.nn.Embedding(10, -3, sparse=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py
index c9c91ceb39de42c44f9ce81658aa79b896999552..4af0cce12b7334857c54ae9e8e9418848275ff32 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py
@@ -73,8 +73,13 @@ class EmbeddingStatic(unittest.TestCase):
                     dtype="int32")
 
                 emb = functional.embedding(
-                    x=label, weight=weight, sparse=True, name="embedding")
+                    x=label,
+                    weight=weight,
+                    padding_idx=129,
+                    sparse=True,
+                    name="embedding")
 
+        with self.assertRaises(ValueError):
             test_bad_x()
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py
index c6cfe01dce40458684c7464ca5ebddd389c62cbe..0c39dc5e731d25720149af4480020a7ab3ac5bb9 100644
--- a/python/paddle/fluid/tests/unittests/test_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py
@@ -101,6 +101,29 @@ class TestReduceMeanWithDimDoubleGradCheck(unittest.TestCase):
             self.func(p)
 
 
+class TestReduceSumWithDimDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        shape = [7, 11]
+        eps = 0.05
+        dtype = np.float64
+
+        x = layers.data('x', shape, False, dtype)
+        x.persistable = True
+        y = layers.reduce_sum(x, dim=0)
+        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], y, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 class TestMulDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py
index c047cf6ddff78641b918de75a284574175bb3bca..352089e1fb75fa4c3423d29012fd85c3d611c81b 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_all.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_all.py
@@ -26,11 +26,11 @@ def p_norm(x, axis, porder, keepdims=False):
     if axis is None:
         x = x.flatten()
         if porder == np.inf:
-            r = np.amax(np.abs(x))
+            r = np.amax(np.abs(x), keepdims=keepdims)
         elif porder == -np.inf:
-            r = np.amin(np.abs(x))
+            r = np.amin(np.abs(x), keepdims=keepdims)
         else:
-            r = np.linalg.norm(x, ord=porder)
+            r = np.linalg.norm(x, ord=porder, keepdims=keepdims)
     elif isinstance(axis, list or tuple) and len(axis) == 2:
         if porder == np.inf:
             axis = tuple(axis)
@@ -41,10 +41,10 @@ def p_norm(x, axis, porder, keepdims=False):
         elif porder == 0:
             axis = tuple(axis)
             r = x.astype(bool)
-            r = np.sum(r, axis)
+            r = np.sum(r, axis, keepdims=keepdims)
         elif porder == 1:
             axis = tuple(axis)
-            r = np.sum(np.abs(x), axis)
+            r = np.sum(np.abs(x), axis, keepdims=keepdims)
         else:
             axis = tuple(axis)
             xp = np.power(np.abs(x), porder)
@@ -61,7 +61,7 @@ def p_norm(x, axis, porder, keepdims=False):
 
 def frobenius_norm(x, axis=None, keepdims=False):
     if isinstance(axis, list): axis = tuple(axis)
-    if axis is None: axis = (-2, -1)
+    if axis is None: x = x.reshape(1, x.size)
     r = np.linalg.norm(
         x, ord='fro', axis=axis, keepdims=keepdims).astype(x.dtype)
     return r
@@ -217,28 +217,37 @@ class TestPnormOp5(TestPnormOp):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
 
 
-def run_fro(self, p, axis, shape_x, dtype):
+def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False):
     with fluid.program_guard(fluid.Program()):
         data = fluid.data(name="X", shape=shape_x, dtype=dtype)
-        out = paddle.norm(x=data, p=p, axis=axis)
+        out = paddle.norm(x=data, p=p, axis=axis, keepdim=keep_dim)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype)
-        expected_result = frobenius_norm(np_input, axis=axis)
+        expected_result = frobenius_norm(np_input, axis=axis, keepdims=keep_dim)
         result, = exe.run(feed={"X": np_input}, fetch_list=[out])
     self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True)
+    if keep_dim and check_dim:
+        self.assertEqual(
+            (np.abs(np.array(result.shape) - np.array(expected_result.shape)) <
+             1e-6).all(), True)
 
 
-def run_pnorm(self, p, axis, shape_x, dtype):
+def run_pnorm(self, p, axis, shape_x, dtype, keep_dim, check_dim=False):
     with fluid.program_guard(fluid.Program()):
         data = fluid.data(name="X", shape=shape_x, dtype=dtype)
-        out = paddle.norm(x=data, p=p, axis=axis)
+        out = paddle.norm(x=data, p=p, axis=axis, keepdim=keep_dim)
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype)
-        expected_result = p_norm(np_input, porder=p, axis=axis).astype(dtype)
+        expected_result = p_norm(
+            np_input, porder=p, axis=axis, keepdims=keep_dim).astype(dtype)
         result, = exe.run(feed={"X": np_input}, fetch_list=[out])
-        self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True)
+    self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True)
+    if keep_dim and check_dim:
+        self.assertEqual(
+            (np.abs(np.array(result.shape) - np.array(expected_result.shape)) <
+             1e-6).all(), True)
 
 
 def run_graph(self, p, axis, shape_x, dtype):
@@ -253,6 +262,7 @@ def run_graph(self, p, axis, shape_x, dtype):
 
     # compute frobenius norm along last two dimensions.
     out_fro = paddle.norm(x, p='fro')
+    out_fro = paddle.norm(x, p='fro', axis=0)
     out_fro = paddle.norm(x, p='fro', axis=[0, 1])
     # compute 2-order  norm along [0,1] dimension.
     out_pnorm = paddle.norm(x, p=2, axis=[0, 1])
@@ -274,27 +284,133 @@ def run_graph(self, p, axis, shape_x, dtype):
 
 class API_NormTest(unittest.TestCase):
     def test_basic(self):
-        run_fro(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32")
-        run_fro(self, p='fro', axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(self, p=2, axis=None, shape_x=[3, 4], dtype="float32")
-        run_pnorm(self, p=2, axis=1, shape_x=[3, 4], dtype="float64")
-        run_pnorm(self, p=np.inf, axis=0, shape_x=[2, 3, 4], dtype="float32")
-        run_pnorm(self, p=np.inf, axis=None, shape_x=[2, 3, 4], dtype="float32")
-        run_pnorm(self, p=-np.inf, axis=0, shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(
-            self, p=-np.inf, axis=None, shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(self, p=0, axis=1, shape_x=[3, 4], dtype="float64")
-
-        run_pnorm(self, p=1, axis=1, shape_x=[3, 4], dtype="float64")
-        run_pnorm(self, p=0, axis=None, shape_x=[3, 4], dtype="float64")
-        run_pnorm(self, p=2, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(self, p=2, axis=-1, shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(self, p=1, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(self, p=0, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(
-            self, p=np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
-        run_pnorm(
-            self, p=-np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64")
+        keep_dims = {False, True}
+        for keep in keep_dims:
+            run_fro(
+                self,
+                p='fro',
+                axis=None,
+                shape_x=[2, 3, 4],
+                dtype="float32",
+                keep_dim=keep)
+            run_fro(
+                self,
+                p='fro',
+                axis=[0, 1],
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=2,
+                axis=None,
+                shape_x=[3, 4],
+                dtype="float32",
+                keep_dim=keep)
+            run_pnorm(
+                self,
+                p=2,
+                axis=1,
+                shape_x=[3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=np.inf,
+                axis=0,
+                shape_x=[2, 3, 4],
+                dtype="float32",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=np.inf,
+                axis=None,
+                shape_x=[2, 3, 4],
+                dtype="float32",
+                keep_dim=keep)
+            run_pnorm(
+                self,
+                p=-np.inf,
+                axis=0,
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=-np.inf,
+                axis=None,
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep)
+            run_pnorm(
+                self,
+                p=0,
+                axis=1,
+                shape_x=[3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+
+            run_pnorm(
+                self,
+                p=1,
+                axis=1,
+                shape_x=[3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=0,
+                axis=None,
+                shape_x=[3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=2,
+                axis=[0, 1],
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=2,
+                axis=-1,
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=1,
+                axis=[0, 1],
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=np.inf,
+                axis=[0, 1],
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
+            run_pnorm(
+                self,
+                p=-np.inf,
+                axis=[0, 1],
+                shape_x=[2, 3, 4],
+                dtype="float64",
+                keep_dim=keep,
+                check_dim=True)
 
     def test_dygraph(self):
         run_graph(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32")
@@ -315,6 +431,7 @@ class API_NormTest(unittest.TestCase):
                 paddle.norm(data, p=p, out=out)
 
             self.assertRaises(TypeError, err_dtype, "fro", [2, 2], "int64")
+            self.assertRaises(ValueError, paddle.norm, "inf", [2], "int64")
             out = fluid.data(name="out", shape=[1], dtype="int64")
             self.assertRaises(TypeError, err_dtype, "fro", [2, 2], "float64",
                               out)
@@ -325,6 +442,7 @@ class API_NormTest(unittest.TestCase):
             self.assertRaises(ValueError, paddle.norm, data, p="unsupport norm")
             self.assertRaises(ValueError, paddle.norm, data, p=[1])
             self.assertRaises(ValueError, paddle.norm, data, p=[1], axis=-1)
+            self.assertRaises(ValueError, paddle.norm, 0, [1, 0], "float64")
             data = fluid.data(name="data_3d", shape=[2, 2, 2], dtype="float64")
             self.assertRaises(
                 ValueError, paddle.norm, data, p='unspport', axis=[-3, -2, -1])
diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
index a77f1cdd57d7bade92e2a4f914dc3d91624d4845..505a1c738384194032329f66c33fa27e3ed3045c 100644
--- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py
@@ -165,7 +165,6 @@ class TestPool3d_API(unittest.TestCase):
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
     def check_max_dygraph_ndhwc_results(self, place):
-        print("run ndchw max pool3d")
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(
@@ -190,7 +189,6 @@ class TestPool3d_API(unittest.TestCase):
                     np.transpose(result.numpy(), [0, 4, 1, 2, 3]), result_np))
 
     def check_max_dygraph_ceilmode_results(self, place):
-        print("run ceil mode max pool3d")
         with fluid.dygraph.guard(place):
             input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32")
             input = fluid.dygraph.to_variable(input_np)
diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
index b01863880866e247f2aee4b94ae3121c9d891f92..fb8a090b80700d9b884a72f7f430723754523a13 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py
@@ -181,16 +181,11 @@ class TestROIAlignInLodOp(TestROIAlignOp):
         self.calc_roi_align()
 
         seq_len = self.rois_lod[0]
-        cur_len = 0
-        lod = [cur_len]
-        for l in seq_len:
-            cur_len += l
-            lod.append(cur_len)
 
         self.inputs = {
             'X': self.x,
             'ROIs': (self.rois[:, 1:5], self.rois_lod),
-            'RoisLod': np.asarray(lod).astype('int64')
+            'RoisNum': np.asarray(seq_len).astype('int32')
         }
 
         self.attrs = {
diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
index 1200b0e3470f650dce4365ee46458c8184281292..c6622cf8d9ce8ae655a6b2e5c130ed9990fd2a5b 100644
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
@@ -174,16 +174,11 @@ class TestROIPoolInLodOp(TestROIPoolOp):
         self.calc_roi_pool()
 
         seq_len = self.rois_lod[0]
-        cur_len = 0
-        lod = [cur_len]
-        for l in seq_len:
-            cur_len += l
-            lod.append(cur_len)
 
         self.inputs = {
             'X': self.x,
             'ROIs': (self.rois[:, 1:5], self.rois_lod),
-            'RoisLod': np.asarray(lod).astype('int64')
+            'RoisNum': np.asarray(seq_len).astype('int32')
         }
 
         self.attrs = {
diff --git a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
index aed265b21b5781d88da0380b04872061e893d736..2cd2599f2ea2f4fb26b2d2730ca45384a3b664a7 100644
--- a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
@@ -142,6 +142,18 @@ class TestTrilTriuOpAPI(unittest.TestCase):
             self.assertTrue(np.allclose(tril_out, np.tril(data)))
             self.assertTrue(np.allclose(triu_out, np.triu(data)))
 
+    def test_fluid_api(self):
+        data = np.random.random([1, 9, 9, 4]).astype('float32')
+        x = fluid.data(shape=[1, 9, -1, 4], dtype='float32', name='x')
+        triu_out = fluid.layers.triu(x)
+
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        triu_out = exe.run(fluid.default_main_program(),
+                           feed={"x": data},
+                           fetch_list=[triu_out])
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py
index 716be1b539809ea3f90885b512f51ac45d85cd37..d388ba62f2a244f84497810739e5fd6b50f669d2 100644
--- a/python/paddle/hapi/model_summary.py
+++ b/python/paddle/hapi/model_summary.py
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 import numpy as np
+import numbers
 
 import paddle
 import paddle.nn as nn
@@ -107,6 +109,11 @@ def summary(net, input_size, batch_size=None, dtypes=None):
     if batch_size is None:
         batch_size = -1
 
+    if not paddle.in_dynamic_mode():
+        warnings.warn(
+            "Your model was created in static mode, this may not get correct summary information!"
+        )
+
     result, params_info = summary_string(net, _input_size, batch_size, dtypes)
     print(result)
 
@@ -121,16 +128,16 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None):
 
     depth = len(list(model.sublayers()))
 
-    def register_hook(module):
-        def hook(module, input, output):
-            class_name = str(module.__class__).split(".")[-1].split("'")[0]
+    def register_hook(layer):
+        def hook(layer, input, output):
+            class_name = str(layer.__class__).split(".")[-1].split("'")[0]
 
             try:
-                module_idx = int(module._full_name.split('_')[-1])
+                layer_idx = int(layer._full_name.split('_')[-1])
             except:
-                module_idx = len(summary)
+                layer_idx = len(summary)
 
-            m_key = "%s-%i" % (class_name, module_idx + 1)
+            m_key = "%s-%i" % (class_name, layer_idx + 1)
             summary[m_key] = OrderedDict()
             summary[m_key]["input_shape"] = list(input[0].shape)
             summary[m_key]["input_shape"][0] = batch_size
@@ -142,23 +149,50 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None):
                 summary[m_key]["output_shape"][0] = batch_size
 
             params = 0
-            if hasattr(module, "weight") and hasattr(module.weight, "shape"):
-                params += np.prod(module.weight.shape)
-                summary[m_key]["trainable"] = module.weight.trainable or (
-                    not module.weight.stop_gradient)
-            if hasattr(module, "bias") and hasattr(module.bias, "shape"):
-                params += np.prod(module.bias.shape)
+
+            if paddle.in_dynamic_mode():
+                layer_state_dict = layer._parameters
+            else:
+                layer_state_dict = layer.state_dict()
+
+            for k, v in layer_state_dict.items():
+                params += np.prod(v.shape)
+
+                try:
+                    if (getattr(getattr(layer, k), 'trainable')) and (
+                            not getattr(getattr(layer, k), 'stop_gradient')):
+                        summary[m_key]["trainable"] = True
+                    else:
+                        summary[m_key]["trainable"] = False
+                except:
+                    summary[m_key]["trainable"] = True
+
             summary[m_key]["nb_params"] = params
 
-        if (not isinstance(module, nn.Sequential) and
-                not isinstance(module, nn.LayerList) and
-            (not (module == model) or depth < 1)):
+        if (not isinstance(layer, nn.Sequential) and
+                not isinstance(layer, nn.LayerList) and
+            (not (layer == model) or depth < 1)):
+
+            hooks.append(layer.register_forward_post_hook(hook))
+
+    def _check_input_size(input_sizes):
+        for input_size in input_sizes:
+            for item in input_size:
+                if not isinstance(item, numbers.Number):
+                    raise TypeError(
+                        "Expected item in input size be a number, but got {}".
+                        format(type(item)))
 
-            hooks.append(module.register_forward_post_hook(hook))
+                if item <= 0:
+                    raise ValueError(
+                        "Expected item in input size greater than zero, but got {}".
+                        format(item))
 
     if isinstance(input_size, tuple):
         input_size = [input_size]
 
+    _check_input_size(input_size)
+
     x = [
         paddle.rand(
             [2] + list(in_size), dtype=dtype)
@@ -197,7 +231,12 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None):
             "{0:,}".format(summary[layer]["nb_params"]), )
         total_params += summary[layer]["nb_params"]
 
-        total_output += np.prod(summary[layer]["output_shape"])
+        try:
+            total_output += np.prod(summary[layer]["output_shape"])
+        except:
+            for output_shape in summary[layer]["output_shape"]:
+                total_output += np.prod(output_shape)
+
         if "trainable" in summary[layer]:
             if summary[layer]["trainable"] == True:
                 trainable_params += summary[layer]["nb_params"]
diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py
index bc48cc21c29e6683602f37fb3eab6c9485fe4977..0794b95c801011da6845eaf82c32a5428e0d5f41 100644
--- a/python/paddle/nn/functional/input.py
+++ b/python/paddle/nn/functional/input.py
@@ -113,17 +113,18 @@ def one_hot(x, num_classes, name=None):
 
 def embedding(x, weight, padding_idx=None, sparse=False, name=None):
     """
-    The operator is used to lookup embeddings vector of ids provided by :attr:`input` .
+    The operator is used to lookup embeddings vector of ids provided by :attr:`x` .
 
     The shape of output Tensor is generated by appending the last dimension of the input Tensor shape
     with embedding size.
-    **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < weight.shape[0]` ,
+
+    **Note:** The id in :attr:`x` must satisfy :math:`0 =< id < weight.shape[0]` ,
     otherwise the program will throw an exception and exit.
 
     .. code-block:: text
 
         Case 1:
-            input is a Tensor. 
+            x is a Tensor.
                 padding_idx = -1
                 x.data = [[1, 3], [2, 4], [4, 127]]
                 x.shape = [3, 2]
@@ -138,7 +139,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None):
                             [0.0,         0.0,         ..., 0.0        ]]]  # padding data
 
             The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127
-            It will pad all-zero data when ids is 127.
+            It will pad all-zero data when id is 127.
 
     Args:
         x(Tensor): A Tensor with type int32/int64, which contains the id information. The value of the input id should
@@ -151,10 +152,10 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None):
             such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` ,
             :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` ,
             :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` .
-            In these cases, is_sparse must be False. Default: False.
-        padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size).
+            In these cases, sparse must be False. Default: False.
+        padding_idx(int|long|None): padding_idx needs to be in the interval [-weight.shape[0], weight.shape[0]).
             If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted
-            to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup
+            to :math:`weight.shape[0] + padding\_idx` . It will output all-zero padding data whenever lookup
             encounters :math:`padding\_idx` in id. And the padding data will not be updated while training.
             If set None, it makes no effect to output. Default: None.
         name(str|None): For detailed information, please refer
@@ -162,7 +163,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None):
            None by default.
 
     Returns:
-        Tensor: Embedding Tensor  mapped by input. The data type is the same as :attr:`weight`.
+        Tensor: Embedding Tensor  mapped by x. The data type is the same as :attr:`weight`.
 
     Examples:
 
@@ -209,6 +210,10 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None):
         padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
             weight.shape[0] + padding_idx)
 
+        if padding_idx >= weight.shape[0] or padding_idx < -weight.shape[0]:
+            raise ValueError("padding_idx must be within [-{}, {})".format(
+                weight.shape[0], weight.shape[0]))
+
         helper.append_op(
             type='lookup_table_v2',
             inputs={'Ids': x,
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 3d5894064c44cb72259472fc638d46b67c5703fc..6c139b0ddbbb996145e3a611839bf5e2e113f3cd 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -780,10 +780,10 @@ def kl_div(input, label, reduction='mean', name=None):
             input = np.random.uniform(-10, 10, shape).astype('float32')
             target = np.random.uniform(-10, 10, shape).astype('float32')
 
-            # 'batchmean' reduction, loss shape will be [N]
+            # 'batchmean' reduction, loss shape will be [1]
             pred_loss = F.kl_div(paddle.to_tensor(input),
                                  paddle.to_tensor(target), reduction='batchmean')
-            # shape=[5]
+            # shape=[1]
 
             # 'mean' reduction, loss shape will be [1]
             pred_loss = F.kl_div(paddle.to_tensor(input),
diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py
index 662205ab69550255406ff5edfda4556b73b98843..042625a3dbd6b07487d6f77442621959f7492af6 100755
--- a/python/paddle/nn/functional/pooling.py
+++ b/python/paddle/nn/functional/pooling.py
@@ -389,7 +389,7 @@ def avg_pool3d(x,
                stride=None,
                padding=0,
                ceil_mode=False,
-               count_include_pad=False,
+               count_include_pad=True,
                divisor_override=None,
                data_format="NCDHW",
                name=None):
diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py
index 1dfdac26e990851ac5f192742acd47fb92633d0d..a74a98d5ed45b9f613b0f2f6d5f04544ffae3d2a 100644
--- a/python/paddle/nn/functional/vision.py
+++ b/python/paddle/nn/functional/vision.py
@@ -249,7 +249,7 @@ def grid_sample(x,
         mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'.
                          Default: 'bilinear'.
         padding_mode(str, optional) The padding method used when source index
-                   is out of input images. It can be 'zeros', 'reflect' and 'border'.
+                   is out of input images. It can be 'zeros', 'reflection' and 'border'.
                    Default: zeros.
         align_corners(bool, optional): If `align_corners` is true, it will projects
                    -1 and 1 to the centers of the corner pixels. Otherwise, it will
@@ -312,7 +312,7 @@ def grid_sample(x,
     if not isinstance(grid, Variable):
         raise ValueError("The grid should be a Variable")
     _modes = ['bilinear', 'nearest']
-    _padding_modes = ['zeros', 'reflect', 'border']
+    _padding_modes = ['zeros', 'reflection', 'border']
     if mode not in _modes:
         raise ValueError(
             "The mode of grid sample function should be in {}, but got: {}".
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index a1923542c40828b0886e9f5ab1f97e04e94fec92..433443fee1765a3ecd4cf0bbe53a960bbeaefc71 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -1564,22 +1564,18 @@ class CosineSimilarity(layers.Layer):
 
 class Embedding(layers.Layer):
     """
-    :alias_main: paddle.nn.Embedding
-	:alias: paddle.nn.Embedding,paddle.nn.layer.Embedding,paddle.nn.layer.common.Embedding
-	:old_api: paddle.fluid.dygraph.Embedding
-
     **Embedding Layer**
 
     This interface is used to construct a callable object of the ``Embedding`` class.
     For specific usage, refer to code examples. It implements the function of the Embedding Layer.
-    This layer is used to lookup embeddings vector of ids provided by :attr:`input` .
+    This layer is used to lookup embeddings vector of ids provided by :attr:`x` .
     It automatically constructs a 2D embedding matrix based on the
-    input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` .
+    input :attr:`num_embeddings` and attr:`embedding_dim`.
 
     The shape of output Tensor is generated by appending an emb_size dimension to the
     last dimension of the input Tensor shape.
 
-    **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` ,
+    **Note:** The id in :attr:`x` must satisfy :math:`0 =< id < num_embeddings` ,
     otherwise the program will throw an exception and exit.
 
     .. code-block:: text
@@ -1607,7 +1603,7 @@ class Embedding(layers.Layer):
         num_embeddings (int): Just one element which indicate the size
             of the dictionary of embeddings.
         embedding_dim:  Just one element which indicate the size of each embedding vector respectively.
-        padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size).
+        padding_idx(int|long|None): padding_idx needs to be in the interval [-num_embeddings, num_embeddings).
             If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted
             to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup
             encounters :math:`padding\_idx` in id. And the padding data will not be updated while training.
@@ -1618,13 +1614,13 @@ class Embedding(layers.Layer):
             such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` ,
             :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` ,
             :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` .
-            In these case, is_sparse must be False. Default: False.
+            In these case, sparse must be False. Default: False.
         weight_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the
-            default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition,
+            default weight parameter property is used. See usage for details in :ref:`api_ParamAttr` . In addition,
             user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter.
             The local word vector needs to be transformed into numpy format, and the shape of local word
-            vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer`
-            is used to load custom or pre-trained word vectors. See code example 2 for details.
+            vector should be consistent with :attr:`num_embeddings` . Then :ref:`api_initializer_NumpyArrayInitializer`
+            is used to load custom or pre-trained word vectors. See code example for details.
         name(str|None): For detailed information, please refer
                to :ref:`api_guide_Name`. Usually name is no need to set and
                None by default.
@@ -1639,20 +1635,34 @@ class Embedding(layers.Layer):
 
         .. code-block:: python
 
-          import paddle
-          import paddle.nn as nn
-          import numpy as np
-          paddle.disable_static()
+            import paddle
+            import numpy as np
+
+            x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64)
+            y_data = np.arange(6, 12).reshape((3, 2)).astype(np.float32)
+            paddle.disable_static(paddle.CPUPlace())
+            x = paddle.to_tensor(x_data, stop_gradient=False)
+            y = paddle.to_tensor(y_data, stop_gradient=False)
+
+            embedding = paddle.nn.Embedding(10, 3, sparse=True)
+
+            w0=np.full(shape=(10, 3), fill_value=2).astype(np.float32)
+            embedding.weight.set_value(w0)
 
-          # example 1
-          inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64')
-          inp_word.shape  # [2, 3]
-          dict_size = 20
+            adam = paddle.optimizer.Adam(parameters=[embedding.weight], learning_rate=0.01)
+            adam.clear_grad()
+
+            # weight.shape = [10, 3]
+
+            # x.data = [[3],[4],[5]]
+            # x.shape = [3, 1]
+
+            # out.data = [[2,2,2], [2,2,2], [2,2,2]]
+            # out.shape = [3, 1, 3]
+            out=embedding(x)
+            out.backward()
+            adam.step()
 
-          emb = nn.Embedding(
-                    dict_size,
-                    32,
-                    sparse=False)
     """
 
     def __init__(self,
@@ -1669,13 +1679,24 @@ class Embedding(layers.Layer):
         self._is_distributed = False
         self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
             num_embeddings + padding_idx)
+
+        if self._num_embeddings <= 0:
+            raise ValueError("num_embeddings must be gather than 0")
+
+        if self._embedding_dim <= 0:
+            raise ValueError("embedding_dim must be gather than 0")
+
+        if self._padding_idx >= num_embeddings or self._padding_idx < -num_embeddings:
+            raise ValueError("padding_idx must be within [-{}, {})".format(
+                num_embeddings, num_embeddings))
+
         self._dtype = self._helper.get_default_dtype()
         self._size = [self._num_embeddings, self._embedding_dim]
 
         self._weight_attr = weight_attr
         self._remote_prefetch = False
         self._name = name
-        self._weight = self.create_parameter(
+        self.weight = self.create_parameter(
             attr=self._weight_attr,
             shape=self._size,
             dtype=self._dtype,
@@ -1684,7 +1705,7 @@ class Embedding(layers.Layer):
     def forward(self, x):
         return F.embedding(
             x,
-            weight=self._weight,
+            weight=self.weight,
             padding_idx=self._padding_idx,
             sparse=self._sparse,
             name=self._name)
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index a60e615d5064bf4ef2229dd67193774030383888..271dc9b4e685ce06cdb12ccdcb6bb0704a5ef2a1 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -627,10 +627,13 @@ class KLDivLoss(fluid.dygraph.Layer):
     $$l(x, y) = y * (\log(y) - x)$$
 
     Parameters:
-        reduction (str, optional): Indicate how to average the loss,
-            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
-            If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
-            Default is ``'mean'``.
+        reduction (Tensor): Indicate how to average the loss,
+             the candicates are ``'none'`` | ``'batchmean'`` | ``'mean'`` | ``'sum'``.
+             If `reduction` is ``'mean'``, the reduced mean loss is returned;
+             If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned;
+             if `reduction` is ``'sum'``, the reduced sum loss is returned;
+             if `reduction` is ``'none'``, no reduction will be apllied.
+             Default is ``'mean'``.
 
     Shape:
 
@@ -654,11 +657,11 @@ class KLDivLoss(fluid.dygraph.Layer):
             x = np.random.uniform(-10, 10, shape).astype('float32')
             target = np.random.uniform(-10, 10, shape).astype('float32')
 
-            # 'batchmean' reduction, loss shape will be [N]
+            # 'batchmean' reduction, loss shape will be [1]
             kldiv_criterion = nn.KLDivLoss(reduction='batchmean')
             pred_loss = kldiv_criterion(paddle.to_tensor(x),
                                         paddle.to_tensor(target))
-            # shape=[5]
+            # shape=[1]
 
             # 'mean' reduction, loss shape will be [1]
             kldiv_criterion = nn.KLDivLoss(reduction='mean')
@@ -684,7 +687,7 @@ class KLDivLoss(fluid.dygraph.Layer):
         self.reduction = reduction
 
     def forward(self, input, label):
-        out = paddle.nn.functional.kl_div(input, label, self.reduction)
+        out = F.kl_div(input, label, self.reduction)
         return out
 
 
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 7ddda5091a0a260f56b29bcedfdcb0786e82ddd6..67e3ce21ffba0c312eb01163cdf32f87c6433ee1 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -183,12 +183,13 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
         x (Tensor): The input tensor could be N-D tensor, and the input data
             type could be float32 or float64.
         p (float|string, optional): Order of the norm. Supported values are `fro`, `0`, `1`, `2`,
-           `inf`,`-inf` and any positive real number yielding the corresponding p-norm.
-            Not supported: ord < 0, nuclear norm.
+            `inf`, `-inf` and any positive real number yielding the corresponding p-norm. Not supported: ord < 0 and nuclear norm. 
+            Default value is `fro`.
         axis (int|list|tuple, optional): The axis on which to apply norm operation. If axis is int
             or list(int)/tuple(int)  with only one element, the vector norm is computed over the axis.
             If `axis < 0`, the dimension to norm operation is rank(input) + axis.
             If axis is a list(int)/tuple(int) with two elements, the matrix norm is computed over the axis.
+            Defalut value is `None`.
         keepdim (bool, optional): Whether to reserve the reduced dimension in the
             output Tensor. The result tensor will have fewer dimension
             than the :attr:`input` unless :attr:`keepdim` is true, default
@@ -197,13 +198,9 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
             user to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        Variable: Tensor, results of norm operation on the specified axis of input tensor,
+        Tensor: results of norm operation on the specified axis of input tensor,
         it's data type is the same as input's Tensor.
  
-    Raises:
-        TypeError, if out data type is different with the input data type.
-        ValueError, If `p` or `axis` is invalid.
-    
     Examples:
         .. code-block:: python
             
@@ -256,15 +253,13 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
                 "The dim of frobenius norm op should be None or two elements list!"
             )
         if in_dygraph_mode():
-            if dim is None: dim = [-1]
-            return core.ops.frobenius_norm(input, 'dim', dim, 'keepdim',
-                                           keepdim)
-        attrs = {
-            'dim': dim if dim != None else [-2, -1],
-            'keep_dim': keepdim,
-            'reduce_all': False
-        }
-        if len(attrs['dim']) == len(input.shape):
+            if dim is None:
+                return core.ops.frobenius_norm(input, 'keep_dim', keepdim,
+                                               'reduce_all', True)
+            return core.ops.frobenius_norm(input, 'dim', dim, 'keep_dim',
+                                           keepdim, 'reduce_all', False)
+        attrs = {'dim': dim, 'keep_dim': keepdim, 'reduce_all': False}
+        if dim is None:
             attrs['reduce_all'] = True
         check_variable_and_dtype(input, 'input', ['float32', 'float64'],
                                  'frobenius_norm')
@@ -351,42 +346,6 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
 
         return reduce_out
 
-    def p0_matrix_norm(input, porder=0., axis=axis, keepdim=False, name=None):
-        block = LayerHelper('norm', **locals())
-        out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
-
-        cast_out = block.create_variable_for_type_inference(dtype=bool)
-        block.append_op(
-            type='cast',
-            inputs={'X': input},
-            outputs={'Out': cast_out},
-            attrs={
-                'in_dtype': input.dtype,
-                'out_dtype': int(core.VarDesc.VarType.BOOL)
-            })
-        cast_out2 = block.create_variable_for_type_inference(dtype=bool)
-        block.append_op(
-            type='cast',
-            inputs={'X': cast_out},
-            outputs={'Out': cast_out2},
-            attrs={
-                'in_dtype': cast_out.dtype,
-                'out_dtype': int(core.VarDesc.VarType.FP32)
-            })
-        sum_out = block.create_variable_for_type_inference(
-            dtype=block.input_dtype())
-        block.append_op(
-            type='reduce_sum',
-            inputs={'X': cast_out2},
-            outputs={'Out': sum_out},
-            attrs={
-                'dim': axis,
-                'keep_dim': keepdim,
-                'reduce_all': True if axis is None else False
-            })
-        return sum_out
-
     def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None):
         block = LayerHelper('norm', **locals())
         out = block.create_variable_for_type_inference(
@@ -448,7 +407,20 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
 
     #calculate vector norm, where axis is int or list with only one integer
     if isinstance(axis, int):
-        if isinstance(p, (int, float)):
+        if isinstance(p, str):
+            if p == "fro":
+                return vector_norm(
+                    x,
+                    porder=2,
+                    axis=axis,
+                    keepdim=keepdim,
+                    asvector=False,
+                    name=name)
+
+            else:
+                raise ValueError(
+                    "only valid string values are 'fro', found {}".format(p))
+        elif isinstance(p, (int, float)):
             return vector_norm(
                 x,
                 axis=axis,
@@ -464,10 +436,12 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
     elif isinstance(axis, list) and len(axis) == 2:
         if p == "fro":
             return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name)
-        elif p == 0:
-            return p0_matrix_norm(x, axis=axis, keepdim=keepdim, name=name)
         elif p == np.inf or p == -np.inf:
             return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name)
+        elif p == 0:
+            raise ValueError(
+                "just suport axis type int or list (length of list <=1) if p = 0, found {}".
+                format(axis))
         else:
             return p_matrix_norm(
                 x, porder=p, axis=axis, keepdim=keepdim, name=name)
diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py
index b7b5d44650f8d62926241a57feedfd5b932a37f5..5c4e98feaa686217bc78ad3915423593ad4fcdce 100644
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
@@ -523,6 +523,24 @@ class TestModelFunction(unittest.TestCase):
             model.summary(input_size=[(20)])
             model.summary(input_size=(20), batch_size=2)
 
+    def test_summary_nlp(self):
+        paddle.enable_static()
+        nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
+        paddle.summary(nlp_net, (1, 2))
+
+    def test_summary_error(self):
+        with self.assertRaises(TypeError):
+            nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
+            paddle.summary(nlp_net, (1, '2'))
+
+        with self.assertRaises(ValueError):
+            nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
+            paddle.summary(nlp_net, (-1, -1))
+
+        paddle.disable_static()
+        nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3)
+        paddle.summary(nlp_net, (1, 2))
+
     def test_export_deploy_model(self):
         for dynamic in [True, False]:
             fluid.enable_dygraph() if dynamic else None
diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3b6a7546616678c21d4241da42f0965d4b85f2e6
--- /dev/null
+++ b/python/unittest_py/requirements.txt
@@ -0,0 +1,4 @@
+PyGithub
+coverage
+pycrypto
+mock
diff --git a/tools/coverage/coverage_diff.py b/tools/coverage/coverage_diff.py
index 051348d358f459a67d39dd9ca798721a82aa2233..38f671fe4089d1f94caafaf26640e4df75870f55 100644
--- a/tools/coverage/coverage_diff.py
+++ b/tools/coverage/coverage_diff.py
@@ -90,12 +90,12 @@ def get_info_file_lines(info_file, diff_file):
                 continue
 
             elif line.startswith('LF:'):
-                print 'LF:{}'.format(current_lf)
+                print('LF:{}'.format(current_lf))
 
                 continue
 
             elif line.startswith('LH:'):
-                print 'LH:{}'.format(current_lh)
+                print('LH:{}'.format(current_lh))
 
                 continue
 
diff --git a/tools/coverage/coverage_diff_list.py b/tools/coverage/coverage_diff_list.py
index 57222da4d9818ebbcb0b9aeea6937494038eecdf..8975185edadfbd567a428bbd90523923f5ab675d 100644
--- a/tools/coverage/coverage_diff_list.py
+++ b/tools/coverage/coverage_diff_list.py
@@ -40,7 +40,7 @@ def filter_by(list_file, max_rate):
             except:
                 pass
 
-            print name, rate
+            print(name, rate)
 
 
 if __name__ == '__main__':
diff --git a/tools/coverage/coverage_lines.py b/tools/coverage/coverage_lines.py
index eb846cc9f249a3ff177fde437362a1122f4409a5..cdec5b8b1bb1873f8b9ef761e9d8575c89fee234 100644
--- a/tools/coverage/coverage_lines.py
+++ b/tools/coverage/coverage_lines.py
@@ -33,7 +33,7 @@ def get_lines(info_file):
                 hits += 1
 
     if total == 0:
-        print 'no data found'
+        print('no data found')
         exit()
 
     return hits / total
@@ -47,17 +47,17 @@ if __name__ == '__main__':
     expected = float(sys.argv[2])
 
     if not os.path.isfile(info_file):
-        print 'info file {} is not exists, ignored'.format(info_file)
+        print('info file {} is not exists, ignored'.format(info_file))
         exit()
 
     actual = get_lines(info_file)
     actual = round(actual, 3)
 
     if actual < expected:
-        print 'expected >= {} %, actual {} %, failed'.format(
-            round(expected * 100, 1), round(actual * 100, 1))
+        print('expected >= {} %, actual {} %, failed'.format(
+            round(expected * 100, 1), round(actual * 100, 1)))
 
         exit(1)
 
-    print 'expected >= {} %, actual {} %, passed'.format(
-        round(expected * 100, 1), round(actual * 100, 1))
+    print('expected >= {} %, actual {} %, passed'.format(
+        round(expected * 100, 1), round(actual * 100, 1)))
diff --git a/tools/coverage/paddle_coverage.sh b/tools/coverage/paddle_coverage.sh
index d54434b738db5b5e6192f7d3bf9e48edbebc5b7c..008b35d01ca565a6d32265f595dd2d6aa55be707 100644
--- a/tools/coverage/paddle_coverage.sh
+++ b/tools/coverage/paddle_coverage.sh
@@ -5,7 +5,7 @@ set -xe
 PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )"
 
 # install lcov
-curl -o /lcov-1.14.tar.gz -x "" -s https://paddle-ci.gz.bcebos.com/coverage/lcov-1.14.tar.gz
+curl -o /lcov-1.14.tar.gz -x "" -s https://paddle-ci.gz.bcebos.com/coverage/lcov-1.14.tar.gz || exit 101
 tar -xf /lcov-1.14.tar.gz -C /
 cd /lcov-1.14
 make install
@@ -14,7 +14,7 @@ make install
 
 cd /paddle/build
 
-python ${PADDLE_ROOT}/tools/coverage/gcda_clean.py ${GIT_PR_ID}
+python3 ${PADDLE_ROOT}/tools/coverage/gcda_clean.py ${GIT_PR_ID}
 
 lcov --capture -d ./ -o coverage.info --rc lcov_branch_coverage=0
 
@@ -53,9 +53,9 @@ gen_full_html_report || true
 function gen_diff_html_report() {
     if [ "${GIT_PR_ID}" != "" ]; then
 
-        COVERAGE_DIFF_PATTERN="`python ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`"
+        COVERAGE_DIFF_PATTERN="`python3 ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`"
 
-        python ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > git-diff.out
+        python3 ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > git-diff.out
     fi
 
     lcov --extract coverage-full.info \
@@ -63,7 +63,7 @@ function gen_diff_html_report() {
         -o coverage-diff.info \
         --rc lcov_branch_coverage=0
 
-    python ${PADDLE_ROOT}/tools/coverage/coverage_diff.py coverage-diff.info git-diff.out > coverage-diff.tmp
+    python3 ${PADDLE_ROOT}/tools/coverage/coverage_diff.py coverage-diff.info git-diff.out > coverage-diff.tmp
 
     mv -f coverage-diff.tmp coverage-diff.info
 
@@ -82,7 +82,7 @@ set -x
 
 coverage xml -i -o python-coverage.xml
 
-python ${PADDLE_ROOT}/tools/coverage/python_coverage.py > python-coverage.info
+python3 ${PADDLE_ROOT}/tools/coverage/python_coverage.py > python-coverage.info
 
 # python full html report
 #
@@ -143,5 +143,6 @@ echo "Assert Python Diff Coverage"
 python ${PADDLE_ROOT}/tools/coverage/coverage_lines.py python-coverage-diff.info 0.9 || PYTHON_COVERAGE_LINES_ASSERT=1
 
 if [ "$COVERAGE_LINES_ASSERT" = "1" ] || [ "$PYTHON_COVERAGE_LINES_ASSERT" = "1" ]; then
+    echo "exit 9" > /tmp/paddle_coverage.result
     exit 9
 fi
diff --git a/tools/coverage/pull_request.py b/tools/coverage/pull_request.py
index 979f476d2a1f3fbf28d43e2b717deb7b2d1b0fff..105460032f7db538eaf7a193776bf8085e2837a1 100644
--- a/tools/coverage/pull_request.py
+++ b/tools/coverage/pull_request.py
@@ -40,7 +40,7 @@ def get_files(args):
     pull = get_pull(args.pull_id)
 
     for file in pull.get_files():
-        print '/paddle/{}'.format(file.filename)
+        print('/paddle/{}'.format(file.filename))
 
 
 def diff(args):
@@ -55,8 +55,8 @@ def diff(args):
     pull = get_pull(args.pull_id)
 
     for file in pull.get_files():
-        print '+++ {}'.format(file.filename)
-        print file.patch
+        print('+++ {}'.format(file.filename))
+        print(file.patch)
 
 
 if __name__ == '__main__':
diff --git a/tools/coverage/python_coverage.py b/tools/coverage/python_coverage.py
index ba67e12249bb0ccee608ea120321eaa3a2ccbc91..8ad9d85c1bf6b5ed542fb8469173c4f1815050a4 100644
--- a/tools/coverage/python_coverage.py
+++ b/tools/coverage/python_coverage.py
@@ -12,10 +12,7 @@ root = tree.getroot()
 
 sources = root.findall('sources/source')
 
-if len(sources) > 1:
-    exit(1)
-
-source = sources[0].text
+source = sources[-1].text
 
 for clazz in root.findall('packages/package/classes/class'):
     clazz_filename = clazz.attrib.get('filename')
@@ -28,8 +25,8 @@ for clazz in root.findall('packages/package/classes/class'):
     if not path.exists(clazz_filename):
         continue
 
-    print 'TN:'
-    print 'SF:{}'.format(clazz_filename)
+    print('TN:')
+    print('SF:{}'.format(clazz_filename))
 
     branch_index = 0
 
@@ -50,16 +47,16 @@ for clazz in root.findall('packages/package/classes/class'):
             taken = int(taken)
 
             for _ in range(taken):
-                print 'BRDA:{},{},{},{}'.format(line_number, 0, branch_index,
-                                                line_hits)
+                print('BRDA:{},{},{},{}'.format(line_number, 0, branch_index,
+                                                line_hits))
                 branch_index += 1
 
             if line_missing_branches:
                 for missing_branch in line_missing_branches.split(','):
-                    print 'BRDA:{},{},{},{}'.format(line_number, 0,
-                                                    branch_index, 0)
+                    print('BRDA:{},{},{},{}'.format(line_number, 0,
+                                                    branch_index, 0))
                     branch_index += 1
 
-        print 'DA:{},{}'.format(line_number, line_hits)
+        print('DA:{},{}'.format(line_number, line_hits))
 
-    print 'end_of_record'
+    print('end_of_record')