diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index b7a93cd9ee2160090c0142d62d96da72e4c58717..7a94bda0f5f73e48081f68d7b2730e3df1e46232 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -16,6 +16,7 @@ else()
   set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
   set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
   set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
+  set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
 endif()

 ######################################################################################
@@ -188,6 +189,10 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
+elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs11})
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
 endif()

 add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"")
diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake
index 8472a0743b91e69d823ed62f94b55045a31aaabc..bc8611f3862cd14c0de493564ea82a1c9ce66667 100644
--- a/cmake/external/dgc.cmake
+++ b/cmake/external/dgc.cmake
@@ -19,7 +19,7 @@ SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc")
 SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
 SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
 SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
-SET(DGC_URL "http://fleet.bj.bcebos.com/collective_ef2216a.tgz")
+SET(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz")
 INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})

 cache_third_party(extern_dgc
@@ -30,7 +30,7 @@ ExternalProject_Add(
     extern_dgc
     ${EXTERNAL_PROJECT_LOG_ARGS}
     "${DGC_DOWNLOAD_CMD}"
-    URL_MD5 "2f67549fd5f1262383d83289abc4f88f"
+    URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251"
     PREFIX "${DGC_PREFIX_DIR}"
     SOURCE_DIR "${DGC_SOURCES_DIR}"
     CONFIGURE_COMMAND ""
diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index 8a655b2954dea5d6b864616ed2f4d19b167c4be8..3da550519bae2a12139873a2a84680debbaa8f4c 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)

   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa)
+    set(LITE_GIT_TAG 6d2b2a4028a58715b01887b04eb9bff8432eb184)
   endif()

   if(NOT CUDA_ARCH_NAME)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index ae870b766fc3349ea53628e14c68ab9a5826213f..c0adda0da31ae1e7425ddfb352971444c09d5615 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -19,8 +19,8 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn)
 SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
 SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
-SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git)
-SET(MKLDNN_TAG 1ea812f4f5aa1bd989372a23ab50d0f0f81ee677)
+SET(MKLDNN_REPOSITORY https://github.com/oneapi-src/oneDNN.git)
+SET(MKLDNN_TAG 64a48f9565aa72f6359917b3406328075a409939)

 # Introduce variables:
 # * CMAKE_INSTALL_LIBDIR
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 0f6b1c182d5590354c8a970eea339a3e23846f39..ac6cf624e82c0a346fea42fa29fe9bab6ace8d47 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e)
+set(WARPCTC_TAG fc7f226b93758216a03b1be9d24593a12819b984)

 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 9d07a0979d9392c9b2ab78562f8e0ceb8fc5d722..415e07c75425345f5f1ad29a8544e02a5bfb12e4 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -28,7 +28,15 @@ function(CheckCompilerCXX11Flag)
 endfunction()

 CheckCompilerCXX11Flag()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+if (WITH_GPU)
+  if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+  else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+  endif()
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+endif()

 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 1956e5c39ea2524d8a8e2650eb08f8d58f410b73..b0a6dfe29020781e57d57861137861366864abdb 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -386,7 +386,7 @@ function(cc_test_run TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     # No unit test should exceed 2 minutes.
     if (APPLE OR WIN32)
-      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
     else()
       set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
     endif()
@@ -748,7 +748,7 @@ function(py_test TARGET_NAME)
     endif()

     if (APPLE OR WIN32)
-      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
     else()
       # No unit test should exceed 2 minutes in Linux.
       set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index f60a6dc3f0c89dd345b04ea3a1e213de770e5760..aea972ab3db2af862f5230ea6c1eabeed8b611c5 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -138,12 +138,17 @@ function(op_library TARGET)
     # And for detail pybind information, please see generated paddle/pybind/pybind.h.
     file(READ ${TARGET}.cc TARGET_CONTENT)
     string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
-    string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}")
+    # [ \t\r\n]* is used for blank characters
+    string(REGEX MATCH "REGISTER_OPERATOR\\([ \t\r\n]*[a-z0-9_]*," one_register "${multi_register}")
+
     if (one_register STREQUAL "")
       string(REPLACE "_op" "" TARGET "${TARGET}")
     else ()
       string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
       string(REPLACE "," "" TARGET "${TARGET}")
+      # [ \t\r\n]+ is used for blank characters.
+      # Here we use '+' instead of '*' since it is a REPLACE operation.
+      string(REGEX REPLACE "[ \t\r\n]+" "" TARGET "${TARGET}")
     endif()

     # pybind USE_NO_KERNEL_OP
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index c9442e8f843ac152cac02908799a8d24f5951e58..9edfcb967abc26a25a94d368298c1c475295019f 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -243,9 +243,10 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
 ENDIF()

 if(WITH_GPU)
-    include(external/cub)          # download cub
-    list(APPEND third_party_deps extern_cub)
-
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+        include(external/cub)      # download cub
+        list(APPEND third_party_deps extern_cub)
+    endif()
     set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
     file_download_and_uncompress(${CUDAERROR_URL} "cudaerror")  # download file cudaErrorMessage
 endif(WITH_GPU)
diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc
index ab987fb56686594f505e63b6664c2176e5a4ad89..0dd2768ccb9ffa1dc7b85dca500095f8c10479c3 100644
--- a/paddle/fluid/framework/c/c_api.cc
+++ b/paddle/fluid/framework/c/c_api.cc
@@ -49,7 +49,8 @@ std::vector<std::string> PD_GetGradOpDescStrs(
     for (size_t i = 0; i < op_num; ++i) {
       PADDLE_ENFORCE_EQ(
           grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true,
-          "Cannot serialize message.");
+          paddle::platform::errors::Unavailable(
+              "Cannot serialize operator desc message."));
     }
   }
   return ret;
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
old mode 100755
new mode 100644
index 551d1342edeb335d1cad4782f85ae9f94f8739bd..edd1700ae7284c77883af6abd2cd7d511097685f
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -36,7 +36,10 @@ message AMPConfig {
   repeated string custom_black_varnames = 9;
 }

-message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; }
+message LocalSGDConfig {
+  optional int32 k_steps = 1 [ default = 1 ];
+  optional int32 begin_step = 2 [ default = 1 ];
+}

 message GradientMergeConfig {
   optional int32 k_steps = 1 [ default = 1 ];
@@ -52,6 +55,8 @@ message DGCConfig {
 message LarsConfig {
   optional float lars_coeff = 1 [ default = 0.001 ];
   optional float lars_weight_decay = 2 [ default = 0.0005 ];
+  optional float epsilon = 3 [ default = 0.0 ];
+  repeated string exclude_from_weight_decay = 4;
 }

 message LambConfig {
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc
index d5a25605cf81147b520bf541e38f4f75e53ae756..33a91388fd8cc97d181df46ab826d384860d38f5 100644
--- a/paddle/fluid/framework/fleet/nccl_wrapper.cc
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc
@@ -25,7 +25,7 @@ bool NCCLWrapper::is_initialized_ = false;

 void NCCLWrapper::InitNCCL() {
 #if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_, nccl_info_.my_global_rank_)); #endif @@ -41,7 +41,8 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { NCCLInfo NCCLWrapper::GetNCCLId() { #if defined(PADDLE_WITH_NCCL) - PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); #endif return nccl_info_; } @@ -52,8 +53,8 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, nccl_info_.local_rank_ = local_rank; nccl_info_.my_global_rank_ = global_rank; nccl_info_.global_ranks_ = ranks; - PADDLE_ENFORCE(cudaSetDevice(local_rank)); - PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_))); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(local_rank)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); #endif return; } @@ -65,7 +66,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, auto var = scope.FindVar(name); LoDTensor* tensor = var->GetMutable(); int32_t total_size = tensor->numel(); - PADDLE_ENFORCE(platform::dynload::ncclBcast( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(tensor->data()), total_size, ncclFloat, root_rank, nccl_info_.comm_, nccl_info_.stream_)); cudaStreamSynchronize(nccl_info_.stream_); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 8787aa8a94a44c2c36868fea4b88ede5f91b19f4..5bb833f613529a81d5ae4e18fc5ad7cd1136354b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -102,6 +102,8 @@ if(WITH_MKLDNN) pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(scale_matmul_fuse_pass inference DIR mkldnn) + pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn) + pass_library(cpu_bfloat16_pass inference DIR mkldnn) pass_library(fc_mkldnn_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) @@ -162,4 +164,6 @@ endif() cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) cc_test(test_reshape_transpose_matmul_mkldnn_fuse_pass SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc DEPS reshape_transpose_matmul_mkldnn_fuse_pass) cc_test(test_matmul_transpose_reshape_fuse_pass SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc DEPS matmul_transpose_reshape_fuse_pass) + cc_test(test_cpu_bfloat16_placement_pass SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc DEPS cpu_bfloat16_placement_pass) + cc_test(test_cpu_bfloat16_pass SRCS mkldnn/cpu_bfloat16_pass_tester.cc DEPS cpu_bfloat16_pass) endif () diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 3d65fe595373fa98ba237f04134c75d4a60a7242..9c1eaa99a3ca04ddbeecab639d5587d5509e3f00 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1892,6 +1892,82 @@ PDNode *patterns::QuantizePlacement::operator()( return op; } +PDNode *patterns::Bfloat16Placement::operator()( + const std::unordered_set &bfloat16_enabled_op_types) { + std::unordered_set supported_op_types = + 
+      std::unordered_set<std::string>();
+  if (!bfloat16_enabled_op_types.empty()) {
+    supported_op_types = bfloat16_enabled_op_types;
+  }
+  auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types);
+  return op;
+}
+
+PDNode *patterns::OrphanedBfloat16::operator()() {
+  auto *prev_op = pattern->NewNode(prev_op_repr())->assert_is_op();
+  prev_op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "float32";
+  });
+  auto *prev_out = pattern->NewNode(prev_out_repr())->AsOutput();
+
+  auto *op = pattern->NewNode(op_repr())->assert_is_op();
+  op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "bfloat16";
+  });
+  auto *op_out = pattern->NewNode(op_out_repr())->AsOutput();
+
+  auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+  next_op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "float32";
+  });
+
+  prev_op->LinksTo({prev_out});
+  op->LinksFrom({prev_out}).LinksTo({op_out});
+  next_op->LinksFrom({op_out});
+  return next_op;
+}
+
+PDNode *patterns::LastBfloat16Ops::operator()() {
+  auto *op = pattern->NewNode(op_repr())->assert_is_op();
+  op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "bfloat16";
+  });
+  auto *op_out = pattern->NewNode(op_out_repr())->AsOutput();
+
+  auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op();
+  next_op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") !=
+           "bfloat16";
+  });
+
+  op->LinksTo({op_out});
+  next_op->LinksFrom({op_out});
+  return next_op;
+}
+
+PDNode *patterns::FirstBfloat16Ops::operator()() {
+  auto *prev_op = pattern->NewNode(prev_op_repr())->assert_is_op();
+  prev_op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") !=
+           "bfloat16";
+  });
+  auto *op_in = pattern->NewNode(op_in_repr())->AsOutput();
+
+  auto *op = pattern->NewNode(op_repr())->assert_is_op();
+  op->assert_more([&](Node *node) {
+    return node->Op()->GetAttrIfExists<std::string>("mkldnn_data_type") ==
+           "bfloat16";
+  });
+
+  prev_op->LinksTo({op_in});
+  op->LinksFrom({op_in});
+  return op;
+}
+
 PDNode *patterns::MKLDNNInPlace::operator()() {
   const std::unordered_set<std::string> &supported_op_types = {
       "abs",
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 0803265884165bc754489b18d07c0d277a4bd92b..053c1fe832b0088d2abdd3f8eb40a0042e5e2dfe 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1129,6 +1129,47 @@ struct QuantizePlacement : public PatternBase {
   PATTERN_DECL_NODE(op);
 };

+struct Bfloat16Placement : public PatternBase {
+  Bfloat16Placement(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "bfloat16_placement") {}
+  PDNode* operator()(
+      const std::unordered_set<std::string>& bfloat16_enabled_op_types);
+
+  PATTERN_DECL_NODE(op);
+};
+
+struct OrphanedBfloat16 : public PatternBase {
+  OrphanedBfloat16(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "orphaned_bfloat16") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(prev_op);
+  PATTERN_DECL_NODE(prev_out);
+  PATTERN_DECL_NODE(op);
+  PATTERN_DECL_NODE(op_out);
+  PATTERN_DECL_NODE(next_op);
+};
+
+struct LastBfloat16Ops : public PatternBase {
+  LastBfloat16Ops(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "last_bfloat16_ops") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(op);
+  PATTERN_DECL_NODE(op_out);
+  PATTERN_DECL_NODE(next_op);
+};
+
+struct FirstBfloat16Ops : public PatternBase {
+  FirstBfloat16Ops(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "first_bfloat16_ops") {}
+  PDNode* operator()();
+
+  PATTERN_DECL_NODE(prev_op);
+  PATTERN_DECL_NODE(op_in);
+  PATTERN_DECL_NODE(op);
+};
+
 // Pattern used for enforcing inplace computation for in-place computation
 // supporting DNNL ops. softmax, batch_norm and layer_norm
 struct MKLDNNInPlace : public PatternBase {
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..df498865245fc8054f9521026e0b5cd6906b136f
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc
@@ -0,0 +1,159 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+using string::PrettyLogDetail;
+
+void UnlinkNodes(ir::Node* a, ir::Node* b) {
+  a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b),
+                   a->outputs.end());
+  b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a),
+                  b->inputs.end());
+}
+
+void CPUBFloat16Pass::SetInputDataType(ir::Graph* graph) const {
+  GraphPatternDetector gpd;
+  patterns::FirstBfloat16Ops bfloat16_ops{gpd.mutable_pattern(),
+                                          "first_bfloat16_ops"};
+  bfloat16_ops();
+  int quantize_counter = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, bfloat16_ops);
+    GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_ops);
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops);
+
+    if (op->Op()->Type() != "conv2d" && prev_op->Op()->Type() != "quantize") {
+      VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out"));
+      auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc);
+
+      // create a quantize op node
+      OpDesc q_desc;
+      q_desc.SetType("quantize");
+      q_desc.SetInput("Input", std::vector<std::string>({op_in->Name()}));
+      q_desc.SetOutput("Output",
+                       std::vector<std::string>({quantize_out_node->Name()}));
+      q_desc.SetAttr("Scale", 1.f);
+      q_desc.SetAttr("bfloat16", true);
+      q_desc.SetAttr("output_format", Has("data_layout")
+                                          ? Get<std::string>("data_layout")
+                                          : "NCHW");
+      auto quantize_op = g->CreateOpNode(&q_desc);  // OpDesc will be copied.
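+
+      // Find which formal input slot of the consumer op actually carries
+      // op_in, so the quantize output can be rewired into that slot.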
+      std::string op_input_name;
+      for (auto name : op->Op()->InputNames()) {
+        for (auto input_name : op->Op()->Input(name)) {
+          if (input_name == op_in->Name()) op_input_name = name;
+        }
+      }
+
+      PADDLE_ENFORCE_NE(
+          op_input_name.empty(), true,
+          platform::errors::NotFound(
+              "The preceding operator's output must be an input of the "
+              "bfloat16 operator."));
+
+      op->Op()->SetInput(op_input_name,
+                         std::vector<std::string>({quantize_out_node->Name()}));
+
+      UnlinkNodes(op_in, op);
+      IR_NODE_LINK_TO(op_in, quantize_op);
+      IR_NODE_LINK_TO(quantize_op, quantize_out_node);
+      IR_NODE_LINK_TO(quantize_out_node, op);
+      quantize_counter++;
+    }
+  };
+  gpd(graph, handler);
+  PrettyLogDetail("---    added %d quantize op before bfloat16 op",
+                  quantize_counter);
+}
+
+void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const {
+  GraphPatternDetector gpd;
+  patterns::LastBfloat16Ops bfloat16_ops{gpd.mutable_pattern(),
+                                         "last_bfloat16_ops"};
+  bfloat16_ops();
+  int force_fp32_counter = 0, dequantize_counter = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops);
+    GET_IR_NODE_FROM_SUBGRAPH(op_out, op_out, bfloat16_ops);
+    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, bfloat16_ops);
+
+    if ((op->Op()->HasAttr("force_fp32_output") ||
+         op->Op()->HasProtoAttr("force_fp32_output")) &&
+        !op->Op()->GetAttrIfExists<bool>("fuse_residual_connection")) {
+      op->Op()->SetAttr("force_fp32_output", true);
+      force_fp32_counter++;
+    } else if (op->Op()->Type() != "prior_box") {
+      // Create dequantize input variable
+      VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in"));
+      auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc);
+
+      // create a dequantize op node for output.
+      OpDesc deq_desc;
+      deq_desc.SetType("dequantize");
+      deq_desc.SetInput("Input",
+                        std::vector<std::string>({dequantize_in_node->Name()}));
+      deq_desc.SetOutput("Output", std::vector<std::string>({op_out->Name()}));
+      deq_desc.SetAttr("Scale", 1.0f);
+      auto dequantize_op = g->CreateOpNode(&deq_desc);
+
+      std::string op_output_name;
+      for (auto name : op->Op()->OutputNames()) {
+        for (auto output_name : op->Op()->Output(name)) {
+          if (output_name == op_out->Name()) op_output_name = name;
+        }
+      }
+
+      PADDLE_ENFORCE_NE(
+          op_output_name.empty(), true,
+          platform::errors::NotFound(
+              "The bfloat16 operator's output must be an input of the "
+              "following operator."));
+
+      op->Op()->SetOutput(op_output_name, std::vector<std::string>(
+                                              {dequantize_in_node->Name()}));
+
+      UnlinkNodes(op, op_out);
+      IR_NODE_LINK_TO(op, dequantize_in_node);
+      IR_NODE_LINK_TO(dequantize_in_node, dequantize_op);
+      IR_NODE_LINK_TO(dequantize_op, op_out);
+      dequantize_counter++;
+    }
+  };
+  gpd(graph, handler);
+  PrettyLogDetail("---    added %d dequantize op and used %d force_fp32_output",
+                  dequantize_counter, force_fp32_counter);
+}
+
+void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const {
+  SetInputDataType(graph);
+  SetOutputDataType(graph);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(cpu_bfloat16_pass, paddle::framework::ir::CPUBFloat16Pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a7271f7ddc59a2bdcab8457bc34d5c5c6397268
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class CPUBFloat16Pass : public Pass {
+ protected:
+  void SetInputDataType(ir::Graph* graph) const;
+  void SetOutputDataType(ir::Graph* graph) const;
+  void ApplyImpl(ir::Graph* graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..15109db98321343e73fb0c3839e4f7ddf2490948
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc
@@ -0,0 +1,145 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h"
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs, bool use_mkldnn,
+           const std::string& mkldnn_data_type = "float32",
+           const bool force_fp32_output = false) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  op->SetAttr("use_mkldnn", use_mkldnn);
+  op->SetAttr("name", name);
+
+  if (type == "conv2d") {
+    op->SetInput("Input", {inputs[0]});
+    op->SetOutput("Output", {outputs[0]});
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+    op->SetAttr("force_fp32_output", force_fp32_output);
+  } else if (type == "pool2d" || type == "transpose2" || type == "reshape2" ||
+             type == "dropout") {
+    op->SetInput("X", {inputs[0]});
+    op->SetOutput("Out", {outputs[0]});
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+  } else if (type == "fc") {
+    op->SetInput("Input", {inputs[0]});
+    op->SetOutput("Out", {outputs[0]});
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+  } else if (type == "concat") {
+    op->SetInput("X", inputs);
+    op->SetOutput("Out", outputs);
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+  } else if (type == "matmul" || type == "elementwise_add") {
+    op->SetInput("X", {inputs[0]});
+    if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
+    op->SetOutput("Out", {outputs[0]});
+    op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+  }
+}
+
+void PreparePass(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog,
+                 const std::initializer_list<std::string> variable_names,
+                 int* original_nodes_num, int* current_nodes_num) {
+  auto pass = PassRegistry::Instance().Get("cpu_bfloat16_pass");
+
+  graph->reset(pass->Apply(graph->release()));
+
+  *original_nodes_num = (*graph)->Nodes().size();
+  (*graph).reset(pass->Apply((*graph).release()));
+  *current_nodes_num = (*graph)->Nodes().size();
+}
+
+static const std::initializer_list<std::string> variable_names{
+    "z", "a", "b", "c", "d", "e", "f", "g", "h", "i"};
+
+ProgramDesc BuildProgramDesc(bool use_mkldnn) {
+  ProgramDesc prog;
+  for (auto& v : variable_names) {
+    prog.MutableBlock(0)->Var(v);
+  }
+  SetOp(&prog, "dropout", "Dropout1", {"z"}, {"a"}, use_mkldnn, "float32");
+  SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "pool2d", "Pool1", {"b"}, {"c"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "conv2d", "Conv1", {"c"}, {"d"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "dropout", "Dropout2", {"d"}, {"e"}, use_mkldnn, "float32");
+  SetOp(&prog, "transpose2", "Transpose1", {"e"}, {"f"}, use_mkldnn,
+        "bfloat16");
+  SetOp(&prog, "reshape2", "Reshape1", {"f"}, {"g"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "concat", "Concat1", {"g"}, {"h"}, use_mkldnn, "bfloat16");
+  SetOp(&prog, "dropout", "Dropout3", {"h"}, {"i"}, use_mkldnn, "float32");
+
+  return prog;
+}
+
+void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
+              int transpose_count, int quant_count, int dequant_count,
+              int added_nodes_count) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  int original_nodes_num, current_nodes_num;
+  PreparePass(&graph, prog, variable_names, &original_nodes_num,
+              &current_nodes_num);
+
+  int quantize_nodes_count = 0;
+  int dequantize_nodes_count = 0;
+  int conv2d_nodes_count = 0;
+  int pool2d_nodes_count = 0;
+  int transpose2_nodes_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "conv2d") {
+        conv2d_nodes_count++;
+      } else if (op->Type() == "pool2d") {
+        pool2d_nodes_count++;
+      } else if (op->Type() == "transpose2") {
+        transpose2_nodes_count++;
+      } else if (op->Type() == "quantize") {
+        quantize_nodes_count++;
+      } else if (op->Type() == "dequantize") {
+        dequantize_nodes_count++;
+      }
+    }
+  }
+  EXPECT_EQ(conv2d_nodes_count, conv_count);
+  EXPECT_EQ(pool2d_nodes_count, pool_count);
+  EXPECT_EQ(transpose2_nodes_count, transpose_count);
+  EXPECT_EQ(quantize_nodes_count, quant_count);
+  EXPECT_EQ(dequantize_nodes_count, dequant_count);
+  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
+}
+
+TEST(CpuQuantizePass, quantize) {
+  bool use_mkldnn = true;
+  // 1 quantize + 1 dequantize
+  int added_nodes = 2;
+  MainTest(BuildProgramDesc(use_mkldnn), 2, 1, 1, 1, 2, added_nodes);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(cpu_bfloat16_pass);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d7a9c1107bbaac04a3a478014520a9b340b1d5f
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h"
+
+#include <string>
+#include <unordered_set>
+
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+using string::PrettyLogDetail;
+
+void CPUBfloat16PlacementPass::SetMkldnnDataType(
+    ir::Graph* graph, int* bfloat16_operators) const {
+  const auto& op_types_list =
+      Get<std::unordered_set<std::string>>("bfloat16_enabled_op_types");
+  // Set mkldnn_data_type to bfloat16 for all operators that appear in the
+  // bfloat16_enabled_op_types vector or are matched by the Bfloat16Placement
+  // pattern.
+  GraphPatternDetector gpd;
+  patterns::Bfloat16Placement bfloat16_placement_pattern{gpd.mutable_pattern(),
+                                                         "bfloat16_placement"};
+  bfloat16_placement_pattern(op_types_list);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_placement_pattern);
+
+    if ((op->Op()->HasAttr("mkldnn_data_type") ||
+         op->Op()->HasProtoAttr("mkldnn_data_type")) &&
+        !platform::HasOpINT8DataType(op->Op())) {
+      op->Op()->SetAttr("mkldnn_data_type", std::string("bfloat16"));
+      (*bfloat16_operators)++;
+    }
+  };
+  gpd(graph, handler);
+}
+
+void CPUBfloat16PlacementPass::RemoveOrphanedOperators(
+    ir::Graph* graph, int* bfloat16_operators) const {
+  // Find orphaned bfloat16 operators that sit between two float32 operators
+  // and revert their mkldnn_data_type attribute to float32.
+  GraphPatternDetector gpd;
+  patterns::OrphanedBfloat16 orphaned_bfloat16_pattern{gpd.mutable_pattern(),
+                                                       "orphaned_bfloat16"};
+  orphaned_bfloat16_pattern();
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(op, op, orphaned_bfloat16_pattern);
+
+    op->Op()->SetAttr("mkldnn_data_type", std::string("float32"));
+    (*bfloat16_operators)--;
+  };
+  gpd(graph, handler);
+}
+
+void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const {
+  int bfloat16_operators = 0;
+  SetMkldnnDataType(graph, &bfloat16_operators);
+  RemoveOrphanedOperators(graph, &bfloat16_operators);
+  PrettyLogDetail("---    marked %d operators to bfloat16 ",
+                  bfloat16_operators);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(cpu_bfloat16_placement_pass,
+              paddle::framework::ir::CPUBfloat16PlacementPass)
+    // a vector of operator type names with bfloat16 support ("conv2d" etc.)
+    // the second param is the default value for this vector
+    .DefaultPassAttr("bfloat16_enabled_op_types",
+                     new std::unordered_set<std::string>());
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..1911b1a3cb32a6a23585e8240c462aa84e8d869b
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+ * Specifies which operators should be run on bfloat16.
+ */
+class CPUBfloat16PlacementPass : public Pass {
+ protected:
+  void SetMkldnnDataType(ir::Graph* graph, int* bfloat16_operators) const;
+
+  void RemoveOrphanedOperators(ir::Graph* graph,
+                               int* bfloat16_operators) const;
+
+  void ApplyImpl(ir::Graph* graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b9797a4bfcc0048083e059cb003746e3278a039b
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc
@@ -0,0 +1,132 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs,
+           const std::string& mkldnn_data_type = "float32") {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+
+  op->SetType(type);
+  op->SetAttr("mkldnn_data_type", mkldnn_data_type);
+
+  if (type == "conv2d") {
+    op->SetAttr("name", name);
+    op->SetInput("Input", {inputs[0]});
+  } else if (type == "relu") {
+    op->SetInput("X", inputs);
+  } else if (type == "concat") {
+    op->SetAttr("axis", 1);
+    op->SetInput("X", {inputs[0], inputs[1]});
+  } else if (type == "pool2d") {
+    op->SetInput("X", {inputs[0]});
+  } else {
+    FAIL() << "Unexpected operator type.";
+  }
+  op->SetOutput("Out", {outputs[0]});
+}
+
+// operator          mkldnn_data_type
+// ---------------------------------------
+// (a,b)->concat->c  float32
+// c->conv->f        float32
+// f->relu->g        float32
+// g->pool->h        float32
+// h->conv->k        float32
+// k->pool->l        float32
+ProgramDesc BuildProgramDesc() {
+  ProgramDesc prog;
+
+  for (auto& v :
+       std::vector<std::string>({"a", "b", "c", "f", "g", "h", "k", "l"})) {
+    prog.MutableBlock(0)->Var(v);
+  }
+
+  SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"});
+  SetOp(&prog, "conv2d", "conv1", {"c"}, {"f"});
+  SetOp(&prog, "relu", "relu1", {"f"}, {"g"});
+  SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"});
+  SetOp(&prog, "conv2d", "conv2", {"h"}, {"k"});
+  SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"});
+
+  return prog;
+}
+
+void MainTest(std::initializer_list<std::string> bfloat16_enabled_op_types,
+              unsigned expected_bfloat16_data_type_count) {
+  auto prog = BuildProgramDesc();
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+
+  auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass");
+  pass->Set("bfloat16_enabled_op_types",
+            new std::unordered_set<std::string>(bfloat16_enabled_op_types));
+
+  graph.reset(pass->Apply(graph.release()));
+
+  unsigned bfloat16_data_type_count = 0;
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      if (platform::HasOpBFLOAT16DataType(node->Op())) {
+        ++bfloat16_data_type_count;
+      }
+    }
+  }
+
+  EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count);
+}
+
+void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) {
+  auto prog = BuildProgramDesc();
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass");
+  graph.reset(pass->Apply(graph.release()));
+
+  unsigned bfloat16_data_type_count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      if (platform::HasOpBFLOAT16DataType(node->Op())) {
+        ++bfloat16_data_type_count;
+      }
+    }
+  }
+  EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count);
+}
+
+TEST(Bfloat16PlacementPass, enable_all) {
+  MainTest({"conv2d", "pool2d", "relu", "concat"}, 6);
+}
+
+TEST(Bfloat16PlacementPass, enabled_conv_and_pool) {
+  // 2 conv2d + 2 pool2d - 1 orphaned conv2d
+  MainTest({"conv2d", "pool2d"}, 3);
+}
+
+TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(0); }
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(cpu_bfloat16_placement_pass);
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index 40e01c75bb99157aedccd0692d7410b99393c009..198107ea082dc86d9e65a926bf9befe2fc4abfa4 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -615,6 +615,16 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
   GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out,
                             multihead_pattern);

+  // If weights or biases in qkv's fc are shared by multiple multihead_matmul
+  // patterns, we do not support this kind of fusion, and this pass will not
+  // take effect.
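+  // A persistable weight/bias node with more than one output link is consumed
+  // by more than one operator, i.e. it is shared across patterns.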
+  bool is_fc_params_shared =
+      mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 ||
+      mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 ||
+      eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1;
+  if (is_fc_params_shared) {
+    return;
+  }
   fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w,
                mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b,
                reshape2_0, reshape2_qkv_out, scale, scale_out);
diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
index 9a0a5f07a7080593d8f13e07788c703edb92c7ad..405cefa99ebbbe147fc96f63567e13607732780e 100644
--- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc
@@ -20,6 +20,7 @@
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h"
+#include "paddle/fluid/framework/op_version_registry.h"

 namespace paddle {
 namespace framework {
@@ -145,3 +146,11 @@ void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const {

 REGISTER_PASS(transpose_flatten_concat_fuse_pass,
               paddle::framework::ir::TransposeFlattenConcatFusePass);
+REGISTER_PASS_CAPABILITY(transpose_flatten_concat_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .EQ("transpose", 0)
+            .EQ("transpose2", 0)
+            .EQ("flatten", 0)
+            .EQ("concat", 0)
+            .EQ("fusion_transpose_flatten_concat", 0));
diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h
index 171f08390765a64997bff28d80f9360f5da2cd1a..89b499975790060a3a3e3f665c35f8545922e6a7 100644
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@@ -69,7 +69,8 @@ class OpInfo {

   const OpCreator& Creator() const {
     PADDLE_ENFORCE_NOT_NULL(creator_,
-                            "Operator's Creator has not been registered");
+                            platform::errors::NotFound(
+                                "Operator's Creator has not been registered."));
     return creator_;
   }

@@ -79,11 +80,12 @@ class OpInfo {
     std::string type = proto_ ? proto_->type() : "unknown";
     PADDLE_ENFORCE_NOT_NULL(
         grad_op_maker_,
-        "Operator %s's GradOpMaker has not been "
-        "registered.\nPlease check whether %s_op has "
-        "grad_op.\nIf not, please set stop_gradient to True "
-        "for its input and output variables using var.stop_gradient=True.",
-        type.c_str(), type.c_str());
+        platform::errors::NotFound(
+            "Operator %s's GradOpMaker has not been "
+            "registered.\nPlease check whether (%s) operator has "
+            "gradient operator.\nIf not, please set stop_gradient to be True "
+            "for its input and output variables using var.stop_gradient=True.",
+            type.c_str(), type.c_str()));
     return grad_op_maker_;
   }

@@ -100,11 +102,12 @@ class OpInfo {
     std::string type = proto_ ? proto_->type() : "unknown";
     PADDLE_ENFORCE_NOT_NULL(
         dygraph_grad_op_maker_,
-        "Operator %s's DygraphGradOpMaker has not been "
-        "registered.\nPlease check whether %s_op has "
-        "grad_op.\nIf not, please set stop_gradient to True "
-        "for its input and output variables using var.stop_gradient=True.",
-        type.c_str(), type.c_str());
+        platform::errors::NotFound(
+            "Operator %s's DygraphGradOpMaker has not been "
+            "registered.\nPlease check whether (%s) operator has "
+            "gradient operator.\nIf not, please set stop_gradient to be True "
+            "for its input and output variables using var.stop_gradient=True.",
+            type.c_str(), type.c_str()));
     return dygraph_grad_op_maker_;
   }

@@ -130,14 +133,17 @@ class OpInfoMap {
   }

   void Insert(const std::string& type, const OpInfo& info) {
-    PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
+    PADDLE_ENFORCE_NE(Has(type), true,
+                      platform::errors::AlreadyExists(
+                          "Operator (%s) has been registered.", type));
     map_.insert({type, info});
   }

   const OpInfo& Get(const std::string& type) const {
     auto op_info_ptr = GetNullable(type);
-    PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered",
-                            type);
+    PADDLE_ENFORCE_NOT_NULL(
+        op_info_ptr,
+        platform::errors::NotFound("Operator (%s) is not registered.", type));
     return *op_info_ptr;
   }
diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc
index 6d4801e4a0eed7083e671e1d49b8628dfb280cf9..e64c3674e7433bb1d9e54f89b89e5f1e2c521648 100644
--- a/paddle/fluid/framework/op_kernel_type.cc
+++ b/paddle/fluid/framework/op_kernel_type.cc
@@ -33,10 +33,18 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
   cur_loc += OpKernelType::kLibBits;

   int customized_value = key.customized_type_value_;
-  PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits));
+  PADDLE_ENFORCE_LT(customized_value, (1 << OpKernelType::kCustomizeBits),
+                    platform::errors::Unavailable(
+                        "Too many custom OpKernel attribute values, expected "
+                        "maximum value is %d, received value is %d.",
+                        (1 << OpKernelType::kCustomizeBits), customized_value));
   customized_value = customized_value << cur_loc;
   cur_loc += OpKernelType::kCustomizeBits;
-  PADDLE_ENFORCE(cur_loc < 64);
+  PADDLE_ENFORCE_LT(cur_loc, 64,
+                    platform::errors::Unavailable(
+                        "Too many OpKernel attribute values, expected maximum "
+                        "value is 64, received value is %d.",
+                        cur_loc));

   std::hash<int> hasher;
   return hasher(place + data_type + data_layout + library_type +
diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index 3408ab262c16197b92e407d0af6043c8a062b5d4..357c4fb5e57fb5b9172631ca57fbdbfeb19b3143 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -43,7 +43,9 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
 void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
   std::unordered_set<std::string> names;
   auto checker = [&](const std::string& name) {
-    PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
+    PADDLE_ENFORCE_EQ(
+        names.count(name), 0,
+        platform::errors::AlreadyExists("Attribute [%s] is duplicated.", name));
     names.insert(name);
   };
   for (auto& attr : proto_->attrs()) {
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index d8159d6a5c294b85d8d5ab9bbee3b95a5eba793f..6408fadf90ae32adf048156d1369cf22a76d20ea 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -54,9 +54,10 @@ class Registrar {
 template <typename... ARGS>
 struct OperatorRegistrar : public Registrar {
   explicit OperatorRegistrar(const char* op_type) {
-    if (OpInfoMap::Instance().Has(op_type)) {
-      PADDLE_THROW("'%s' is registered more than once.", op_type);
-    }
+    PADDLE_ENFORCE_EQ(
+        OpInfoMap::Instance().Has(op_type), false,
+        platform::errors::AlreadyExists(
+            "Operator '%s' is registered more than once.", op_type));
     static_assert(sizeof...(ARGS) != 0,
                   "OperatorRegistrar should be invoked at least by OpClass");
     OpInfo info;
diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc
index 21d3454467603c58c9513351eba2c09ef6eeacba..45fe66d7db3b546604b640008e0ab61eaa84390e 100644
--- a/paddle/fluid/framework/op_registry_test.cc
+++ b/paddle/fluid/framework/op_registry_test.cc
@@ -58,7 +58,8 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
     AddInput("input", "input of cosine op").AsDuplicable();
     AddOutput("output", "output of cosine op").AsIntermediate();
     auto my_checker = [](int i) {
-      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
+      PADDLE_ENFORCE_EQ(i % 2, 0, platform::errors::InvalidArgument(
+                                      "'test_attr' must be even!"));
     };
     AddAttr<int>("test_attr", "a simple test attribute")
         .AddCustomChecker(my_checker);
diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h
index 5edd70e035f98f408c0104297e084771cd158f53..fea043a0ff311f7b940331b9d392296c331590e9 100644
--- a/paddle/fluid/framework/op_version_registry.h
+++ b/paddle/fluid/framework/op_version_registry.h
@@ -152,10 +152,10 @@ class OpVersionRegistrar {
     return instance;
   }
   OpVersion& Register(const std::string& op_type) {
-    if (op_version_map_.find(op_type) != op_version_map_.end()) {
-      PADDLE_THROW("'%s' is registered in operator version more than once.",
-                   op_type);
-    }
+    PADDLE_ENFORCE_EQ(
+        op_version_map_.find(op_type), op_version_map_.end(),
+        platform::errors::AlreadyExists(
+            "'%s' is registered in operator version more than once.", op_type));
     op_version_map_.insert({op_type, OpVersion()});
     return op_version_map_[op_type];
   }
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index ca2705f154c4f45dfccd954b23209c71701adce5..21fc293e84179da72be8cc5ee50de46a00fe9a0d 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -164,15 +164,20 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
   VLOG(4) << place << " " << DebugStringEx(&scope);
   if (platform::is_gpu_place(place)) {
 #ifndef PADDLE_WITH_CUDA
-    PADDLE_THROW("Cannot run operator on place %s", place);
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Cannot run operator on place %s, please recompile paddle or "
+        "reinstall Paddle with CUDA support.",
+        place));
 #else
     auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
     platform::SetDeviceId(dev_id);
 #endif
   } else if (platform::is_xpu_place(place)) {
 #ifndef PADDLE_WITH_XPU
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Cannot run operator on place %s", place));
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Cannot run operator on place %s, please recompile paddle or "
+        "reinstall Paddle with XPU support.",
+        place));
 #else
     auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
     platform::SetXPUDeviceId(dev_id);
@@ -214,7 +219,7 @@ std::string OperatorBase::Input(const std::string& name) const {
   auto& ins = Inputs(name);
   PADDLE_ENFORCE_LE(
       ins.size(), 1UL,
-      platform::errors::AlreadyExists(
+      platform::errors::InvalidArgument(
           "Operator %s's input %s should contain only one variable.", type_,
           name));
   return ins.empty() ? kEmptyVarName : ins[0];
 }

@@ -223,8 +228,10 @@ std::string OperatorBase::Input(const std::string& name) const {
 const std::vector<std::string>& OperatorBase::Inputs(
     const std::string& name) const {
   auto it = inputs_.find(name);
-  PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.",
-                 type_, name);
+  PADDLE_ENFORCE_NE(
+      it, inputs_.end(),
+      platform::errors::NotFound("Operator %s does not have the input %s.",
+                                 type_, name));
   return it->second;
 }

@@ -238,17 +245,21 @@ bool OperatorBase::HasOutputs(const std::string& name) const {

 std::string OperatorBase::Output(const std::string& name) const {
   auto& outs = Outputs(name);
-  PADDLE_ENFORCE_LE(outs.size(), 1UL,
-                    "Operator %s's output %s should contain only one variable.",
-                    type_, name);
+  PADDLE_ENFORCE_LE(
+      outs.size(), 1UL,
+      platform::errors::InvalidArgument(
+          "Operator %s's output %s should contain only one variable.", type_,
+          name));
   return outs.empty() ? kEmptyVarName : outs[0];
 }

 const std::vector<std::string>& OperatorBase::Outputs(
     const std::string& name) const {
   auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(),
-                 "Operator %s does not have an output called %s.", type_, name);
+  PADDLE_ENFORCE_NE(
+      it, outputs_.end(),
+      platform::errors::NotFound(
+          "Operator %s does not have an output called %s.", type_, name));
   return it->second;
 }

@@ -391,16 +402,19 @@ void OperatorBase::CheckAllInputOutputSet() const {

   for (auto& in : info_->Proto().inputs()) {
     if (!in.dispensable()) {
-      PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
-                     "Operator %s's input, %s, is not set", Type(), in.name());
+      PADDLE_ENFORCE_NE(
+          inputs_.find(in.name()), inputs_.end(),
+          platform::errors::NotFound("Operator %s's input (%s) is not set.",
+                                     Type(), in.name()));
     }
   }

   for (auto& out : info_->Proto().outputs()) {
     if (!out.dispensable()) {
-      PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
-                     "Operator %s's output, %s, is not set", Type(),
-                     out.name());
+      PADDLE_ENFORCE_NE(
+          outputs_.find(out.name()), outputs_.end(),
+          platform::errors::NotFound("Operator %s's output (%s) is not set.",
+                                     Type(), out.name()));
     }
   }
 }
@@ -428,8 +442,9 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) {
   } else if (var.IsType<SelectedRows>()) {
     return &(var.Get<SelectedRows>().value());
   } else {
-    PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
-                 ToTypeName(var.Type()));
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable type is %s, expect LoDTensor or SelectedRows.",
+        ToTypeName(var.Type())));
   }
 }

@@ -439,8 +454,9 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) {
   } else if (var->IsType<SelectedRows>()) {
     return var->GetMutable<SelectedRows>()->mutable_value();
   } else {
-    PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
-                 ToTypeName(var->Type()));
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Variable type is %s, expect LoDTensor or SelectedRows.",
+        ToTypeName(var->Type())));
   }
 }

@@ -462,7 +478,7 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const {

   PADDLE_ENFORCE_LE(
       it->second.size(), 1UL,
-      platform::errors::AlreadyExists(
+      platform::errors::InvalidArgument(
           "Operator %s's input %s should contain only one variable.",
           op_.Type(), name));
   return it->second.empty() ? nullptr : it->second[0];
 }

@@ -472,9 +488,11 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const {
   auto it = ctx_.outputs.find(name);
   if (it == ctx_.outputs.end()) return nullptr;

-  PADDLE_ENFORCE_LE(it->second.size(), 1UL,
-                    "Operator %s's output %s should contain only one variable.",
-                    op_.Type(), name);
+  PADDLE_ENFORCE_LE(
+      it->second.size(), 1UL,
+      platform::errors::InvalidArgument(
+          "Operator %s's output %s should contain only one variable.",
+          op_.Type(), name));
   return it->second.empty() ? nullptr : it->second[0];
 }

@@ -497,10 +515,11 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
   std::transform(vars.begin(), vars.end(), std::back_inserter(res),
                  [&](const Variable* var) -> const Tensor* {
                    if (var == nullptr) return nullptr;
-                   PADDLE_ENFORCE(
-                       var->IsType<LoDTensor>(),
-                       "should be LoDTensor, but the received type is %s",
-                       ToTypeName(var->Type()));
+                   PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
+                                     platform::errors::InvalidArgument(
+                                         "Input variable should be LoDTensor, "
+                                         "but the received type is %s.",
+                                         ToTypeName(var->Type())));
                    return &(var->Get<LoDTensor>());
                  });
   return res;
@@ -558,8 +577,10 @@ class RuntimeInferShapeContext : public InferShapeContext {
     }
     const auto& in = it->second;
     if (in.size() == 0) return false;
-    PADDLE_ENFORCE_EQ(in.size(), 1UL,
-                      "Input %s should not have more than one inputs", name);
+    PADDLE_ENFORCE_EQ(
+        in.size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Input %s should not contain more than one inputs.", name));
     return in[0] != nullptr;
   }

@@ -574,8 +595,10 @@ class RuntimeInferShapeContext : public InferShapeContext {
     if (out.size() == 0) {
       return false;
     }
-    PADDLE_ENFORCE_EQ(out.size(), 1UL,
-                      "Output %s should not have more than one outputs", name);
+    PADDLE_ENFORCE_EQ(
+        out.size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Output %s should not contain more than one outputs.", name));
     return out[0] != nullptr;
   }

@@ -644,16 +667,31 @@ class RuntimeInferShapeContext : public InferShapeContext {
                 size_t j = 0) override {
     auto in_it = ctx_.inputs.find(in);
     auto out_it = ctx_.outputs.find(out);
-    PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i,
-                   "Inputs %s should have %llu argument", in, i);
-    PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j,
-                   "Outputs %s should have %llu argument", out, j);
+    PADDLE_ENFORCE_NE(
+        in_it, ctx_.inputs.end(),
+        platform::errors::NotFound("Input %s does not exist.", in));
+    PADDLE_ENFORCE_NE(
+        out_it, ctx_.outputs.end(),
+        platform::errors::NotFound("Output %s does not exist.", out));
+    PADDLE_ENFORCE_LT(i, in_it->second.size(),
+                      platform::errors::InvalidArgument(
+                          "The index of input dimension is out of range, "
+                          "expected index less than %zu, but received %zu.",
+                          in_it->second.size(), i));
+    PADDLE_ENFORCE_LT(j, out_it->second.size(),
+                      platform::errors::InvalidArgument(
+                          "The index of output dimension is out of range, "
+                          "expected index less than %zu, but received %zu.",
+                          out_it->second.size(), j));

     Variable* in_var = in_it->second[i];
     Variable* out_var = out_it->second[j];

-    PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
-                   "The type of %s and %s is not the same.", in, out);
+    PADDLE_ENFORCE_EQ(
+        in_var->Type(), out_var->Type(),
+        platform::errors::InvalidArgument(
+            "The type of input (%s) and output (%s) are inconsistent.", in,
+            out));

     if (in_var->IsType<SelectedRows>()) {
       auto& in_sele_rows = in_var->Get<SelectedRows>();
@@ -666,9 +704,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
       auto* out_lod_tensor = out_var->GetMutable<LoDTensor>();
out_lod_tensor->Resize(in_lod_tensor.dims()); } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::Unimplemented( "Currently, the input type of ShareDim only can be LoDTensor " - "or SelectedRows."); + "or SelectedRows.")); } } @@ -721,16 +759,30 @@ class RuntimeInferShapeContext : public InferShapeContext { size_t j = 0) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); - PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i, - "Inputs %s should have %llu argument", in, i); - PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j, - "Outputs %s should have %llu argument", out, j); + PADDLE_ENFORCE_NE( + in_it, ctx_.inputs.end(), + platform::errors::NotFound("Input %s does not exist.", in)); + PADDLE_ENFORCE_NE( + out_it, ctx_.outputs.end(), + platform::errors::NotFound("Output %s does not exist.", out)); + PADDLE_ENFORCE_LT(i, in_it->second.size(), + platform::errors::InvalidArgument( + "The index of input dimension is out of range, " + "expected index less than %zu, but received %zu.", + in_it->second.size(), i)); + PADDLE_ENFORCE_LT(j, out_it->second.size(), + platform::errors::InvalidArgument( + "The index of output dimension is out of range, " + "expected index less than %zu, but received %zu.", + out_it->second.size(), j)); Variable* in_var = in_it->second.at(i); if (!in_var->IsType()) return; Variable* out_var = out_it->second.at(j); - PADDLE_ENFORCE(out_var->IsType(), - "The %d-th output of Output(%s) must be LoDTensor.", j, out); + PADDLE_ENFORCE_EQ( + out_var->IsType(), true, + platform::errors::InvalidArgument( + "The %zu-th output of Output(%s) must be LoDTensor.", j, out)); auto& in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); @@ -757,18 +809,18 @@ class RuntimeInferShapeContext : public InferShapeContext { } int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override { - PADDLE_THROW( + PADDLE_THROW(platform::errors::PreconditionNotMet( "GetLoDLevel is only used in compile time. The calculation of " "output's actual lod is different among operators so that should be " - "set in the runtime kernel."); + "set in the runtime kernel.")); } void SetLoDLevel(const std::string& out, int32_t lod_level, size_t j = 0) const override { - PADDLE_THROW( + PADDLE_THROW(platform::errors::PreconditionNotMet( "SetLoDLevel is only used in compile time.
The calculation of " "output's actual lod is different among operators so that should be " - "set in the runtime kernel."); + "set in the runtime kernel.")); } bool IsRuntime() const override { return true; } @@ -794,9 +846,11 @@ class RuntimeInferShapeContext : public InferShapeContext { DDim GetInputDim(const std::string& name) const override { const std::vector& vars = InputVars(name); - PADDLE_ENFORCE_EQ(vars.size(), 1UL, - "Input(%s) should hold one element, but now it holds %d", - name, vars.size()); + PADDLE_ENFORCE_EQ( + vars.size(), 1UL, + platform::errors::InvalidArgument( + "Input(%s) should hold one element, but now it holds %zu elements.", + name, vars.size())); return this->GetDim(vars[0]); } @@ -817,9 +871,11 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetOutputDim(const std::string& name, const DDim& dim) override { auto& vars = OutputVars(name); - PADDLE_ENFORCE_EQ(vars.size(), 1UL, - "Output(%s) should hold one element, but now it holds %d", - name, vars.size()); + PADDLE_ENFORCE_EQ( + vars.size(), 1UL, + platform::errors::InvalidArgument("Output(%s) should hold one element, " + "but now it holds %zu elements.", + name, vars.size())); SetDim(vars[0], dim); } @@ -831,16 +887,17 @@ class RuntimeInferShapeContext : public InferShapeContext { protected: DDim GetDim(Variable* var) const { - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::InvalidArgument("Input variable is nullptr.")); if (var->IsType()) { return var->Get().dims(); } else if (var->IsType()) { return var->Get().GetCompleteDims(); } else { - PADDLE_THROW( - "Only LoDTensor/SelectedRows support 'GetDim', but Variables " - "type_id is %s.", - ToTypeName(var->Type())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Only LoDTensor or SelectedRows support 'GetDim', but input " + "Variable's type is %s.", + ToTypeName(var->Type()))); } } @@ -853,7 +910,8 @@ class RuntimeInferShapeContext : public InferShapeContext { } std::vector GetRepeatedDims(const std::string& name) const override { - PADDLE_THROW("Only compile time support this method"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GetRepeatedDims method only ban be used in compile time.")); } void SetDim(Variable* var, const DDim& dim) { @@ -862,15 +920,22 @@ class RuntimeInferShapeContext : public InferShapeContext { } else if (var->IsType()) { var->GetMutable()->set_height(dim[0]); } else { - PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - ToTypeName(var->Type())); + PADDLE_THROW(platform::errors::Unimplemented( + "Variable type error, expect LoDTensor or SelectedRows, but received " + "(%s).", + ToTypeName(var->Type()))); } } void SetDims(const std::vector& vars, const std::vector& dims) { size_t length = vars.size(); - PADDLE_ENFORCE_EQ(length, dims.size()); + PADDLE_ENFORCE_EQ(length, dims.size(), + platform::errors::InvalidArgument( + "The number of input variables do not match the " + "number of input dimensions, the number of variables " + "is %zu, the number of dimensions is %zu.", + length, dims.size())); for (size_t i = 0; i < length; ++i) { if (vars[i] == nullptr) { continue; @@ -881,7 +946,8 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetRepeatedDims(const std::string& name, const std::vector& dims) override { - PADDLE_THROW("Only compile time support this method"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "SetRepeatedDims method only can be used in compile time.")); } std::vector GetVarTypes( @@ -901,16 +967,19 @@ 
class RuntimeInferShapeContext : public InferShapeContext { private: const std::vector& InputVars(const std::string& name) const { auto it = ctx_.inputs.find(name); - PADDLE_ENFORCE(it != ctx_.inputs.end(), - "Operator %s does not have the input %s.", op_.Type(), name); + PADDLE_ENFORCE_NE( + it, ctx_.inputs.end(), + platform::errors::NotFound( + "Operator (%s) does not have the input (%s).", op_.Type(), name)); return it->second; } const std::vector& OutputVars(const std::string& name) const { auto it = ctx_.outputs.find(name); - PADDLE_ENFORCE(it != ctx_.outputs.end(), - "Operator %s does not have the outputs %s.", op_.Type(), - name); + PADDLE_ENFORCE_NE( + it, ctx_.outputs.end(), + platform::errors::NotFound( + "Operator (%s) does not have the output (%s).", op_.Type(), name)); return it->second; } @@ -928,10 +997,14 @@ static void CheckTensorNANOrInf(const std::string& op_type, tensor.type() != proto::VarType::FP64) { return; } - PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), - "Operator %s output Tensor %s contains Inf", op_type, name); - PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor), - "Operator %s output Tensor %s contains NAN", op_type, name); + PADDLE_ENFORCE_NE( + framework::TensorContainsInf(tensor), true, + platform::errors::Fatal("Operator %s output Tensor %s contains Inf.", + op_type, name)); + PADDLE_ENFORCE_NE( + framework::TensorContainsNAN(tensor), true, + platform::errors::Fatal("Operator %s output Tensor %s contains NAN.", + op_type, name)); } void OperatorWithKernel::RuntimeInferShape(const Scope& scope, @@ -1074,10 +1147,11 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, // check if op[type] has kernel registered. auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", type_); - } + PADDLE_ENFORCE_NE( + kernels_iter, all_op_kernels.end(), + platform::errors::Unavailable( + "There are no kernels registered in the %s operator.", + type_)); OpKernelMap& kernels = kernels_iter->second; @@ -1131,10 +1205,10 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, kernel_iter = kernels.find(expected_kernel_key); } #endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", type_, - KernelTypeToString(expected_kernel_key)); - } + PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), + platform::errors::NotFound( + "Operator (%s) does not have kernel for %s.", type_, + KernelTypeToString(expected_kernel_key))); std::lock_guard lock(cache_update_mutex_); if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { @@ -1149,13 +1223,14 @@ void OperatorWithKernel::TransferInplaceVarsBack( for (auto& var_name : inplace_vars) { VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; auto* origin_var = scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.", - var_name); + PADDLE_ENFORCE_NOT_NULL(origin_var, + platform::errors::InvalidArgument( + "The variable [%s] is nullptr.", var_name)); auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var); auto* var = transfer_scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.", - var_name); + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument( + "The variable [%s] is nullptr.", var_name)); auto* transformed_tensor =
GetLoDTensorOrSelectedRowsValueFromVar(*var); auto original_dims = original_tensor->dims(); original_tensor->ShareDataWith(*transformed_tensor); @@ -1380,9 +1455,11 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType( ParseInputDataType(ctx, name, &data_type); PADDLE_ENFORCE_NE( data_type, dafault_data_type, - "The Input Variable(%s) of %s Op used to determine kernel data type " - "is empty or not LoDTensor or SelectedRows or LoDTensorArray.", - name, Type()); + platform::errors::InvalidArgument( + "The Input Variable(%s) of (%s) Operator used to determine kernel " + "data type is empty or not LoDTensor or SelectedRows or " + "LoDTensorArray.", + name, Type())); return data_type; } diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index c4ce627ff1f940f1625b8650b243d64af2641612..218fc8880bb276a75ed1dd71b04fcd9f387c9a54 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -495,9 +495,9 @@ TEST(IndicateVarDataTypeTest, other) { EXPECT_TRUE( ex_msg.find( "The Input Variable(Other) of " - "indicate_other_data_type_test Op used to " + "(indicate_other_data_type_test) Operator used to " "determine kernel data type " - "is empty or not LoDTensor or SelectedRows or LoDTensorArray") != + "is empty or not LoDTensor or SelectedRows or LoDTensorArray.") != std::string::npos); } ASSERT_TRUE(caught); diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index d3513fb7dbed0413e61796d8a843c38fbbcf93dc..b418339bf32965a454e5b240bb728c4cb41e03ba 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -20,7 +20,10 @@ namespace framework { void ReaderBase::ReadNext(std::vector *out) { std::lock_guard lock(mu_); - PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning); + PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning, + platform::errors::Unavailable( + "The current reader has stopped running and cannot " + "continue to read the next batch of data.")); ReadNextImpl(out); } diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index f8aa87519a2fc1a14765887e95c96883d7b4589f..9b74a55304077c6c13a55f36ea8cf3b6dfbe5b9c 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -32,17 +32,21 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } inline void RDLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, - "acquire read lock failed"); + PADDLE_ENFORCE_EQ( + pthread_rwlock_rdlock(&lock_), 0, + platform::errors::External("The pthread failed to acquire read lock.")); } inline void WRLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, - "acquire write lock failed"); + platform::errors::External( + "The pthread failed to acquire write lock.")); } inline void UNLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); + PADDLE_ENFORCE_EQ( + pthread_rwlock_unlock(&lock_), 0, + platform::errors::External("The pthread failed to unlock.")); } private: diff --git a/paddle/fluid/framework/save_load_util.cc b/paddle/fluid/framework/save_load_util.cc index fbbbfd66b3d8c39d0ccaa7d998bb5c5e9860df4e..602b431995cc59ab67e1a32ac09a3557432c3539 100644 --- a/paddle/fluid/framework/save_load_util.cc +++ b/paddle/fluid/framework/save_load_util.cc @@ -33,7 +33,8 @@ void CheckInStreamState(std::istream& istre, size_t length) { VLOG(5) << "Can't read [" << length << "] from file" << "file seems breakem"; - PADDLE_THROW("Model load error, file seems breaken"); + 
PADDLE_THROW(platform::errors::Unavailable( + "Model load failed, istream state error.")); } } @@ -58,10 +59,11 @@ size_t ReadTensorNumber(std::istream& istre) { sizeof(char) * tensor_number_mark.size()); std::string str_read_tensor_number_mark(tensor_number_mark_buffer, tensor_number_mark.size()); - PADDLE_ENFORCE_EQ( - tensor_number_mark, str_read_tensor_number_mark, - "Tensor number mark not match, expect [%s], but read from file is [%]", - tensor_number_mark, str_read_tensor_number_mark); + PADDLE_ENFORCE_EQ(tensor_number_mark, str_read_tensor_number_mark, + platform::errors::InvalidArgument( + "Tensor number mark does not match, expected mark is " + "[%s], but the mark read from file is [%s].", + tensor_number_mark, str_read_tensor_number_mark)); size_t tensor_number = 0; istre.read(reinterpret_cast(&tensor_number), sizeof(tensor_number)); @@ -79,10 +81,11 @@ std::string ReadTensorName(std::istream& istre) { std::string str_read_tensor_name_mark(name_mark_buffer, tensor_name_mark.size()); - PADDLE_ENFORCE_EQ( - tensor_name_mark, str_read_tensor_name_mark, - "Tensor name mark not match, expect [%s], but read from file is [%]", - tensor_name_mark, str_read_tensor_name_mark); + PADDLE_ENFORCE_EQ(tensor_name_mark, str_read_tensor_name_mark, + platform::errors::InvalidArgument( + "Tensor name mark does not match, expected mark is [%s], " + "but the mark read from file is [%s].", + tensor_name_mark, str_read_tensor_name_mark)); size_t tensor_name_length = 0; istre.read(reinterpret_cast(&tensor_name_length), @@ -117,16 +120,18 @@ bool SaveStaticNameListToDisk( for (size_t i = 0; i < vec_tensor_name_list.size(); ++i) { auto var_ptr = scope.FindVar(vec_tensor_name_list[i]); - PADDLE_ENFORCE_NE( - var_ptr, nullptr, - "Variable find error, when save model, can't not find vairable [%s], " - "Please make sure you have run StartUpProgram", - vec_tensor_name_list[i]); + PADDLE_ENFORCE_NOT_NULL( + var_ptr, platform::errors::NotFound("Variable (%s) is not found when " + "saving model, please make sure " + "that exe.run(startup_program) has " + "been executed.", + vec_tensor_name_list[i])); Tensor* tensor = var_ptr->GetMutable(); PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true, - "Paramter [%s] not initialzed," - "Please make sure you have run StartUpProgram", - vec_tensor_name_list[i]); + platform::errors::PreconditionNotMet( + "Parameter [%s] is not initialized, please make sure " + "that exe.run(startup_program) has been executed.", + vec_tensor_name_list[i])); map_tensor[vec_tensor_name_list[i]] = tensor; } @@ -145,9 +150,10 @@ bool SaveDygraphVarBaseListToDisk( Tensor* tensor = var_ptr->GetMutable(); PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true, - "Paramter [%s] not initialzed," - "Please make sure you have run StartUpProgram", - vec_var_base_list[i]->Name()); + platform::errors::PreconditionNotMet( + "Parameter [%s] is not initialized, please make sure " + "that exe.run(startup_program) has been executed.", + vec_var_base_list[i]->Name())); map_tensor[vec_var_base_list[i]->Name()] = tensor; } @@ -185,34 +191,41 @@ bool LoadStaticNameListFromDisk( for (size_t i = 0; i < vec_tensor_name_list.size(); ++i) { auto it = map_load_tensor.find(vec_tensor_name_list[i]); - PADDLE_ENFORCE(it != map_load_tensor.end(), - "Paramete not found in Model file, " - "Can not find [%s] in model file [%s]", - vec_tensor_name_list[i], file_name); + PADDLE_ENFORCE_NE(it, map_load_tensor.end(), + platform::errors::NotFound( + "Parameter (%s) not found in model file (%s).", + vec_tensor_name_list[i], file_name)); auto var_ptr =
scope.FindVar(vec_tensor_name_list[i]); - PADDLE_ENFORCE_NE( - var_ptr, nullptr, - "Parameter not created, when load model, can't not find parameter [%s] " - "please make sure you have run StartUpProgram", - vec_tensor_name_list[i]); + PADDLE_ENFORCE_NOT_NULL( + var_ptr, + platform::errors::PreconditionNotMet( + "Parameter (%s) is not created when loading model, " + "please make sure that exe.run(startup_program) has been executed.", + vec_tensor_name_list[i])); Tensor* tensor = var_ptr->GetMutable(); - PADDLE_ENFORCE_NE(tensor, nullptr, - "Paramter [%s] not initialzed " - "please make sure you have run startUpProgram", - vec_tensor_name_list[i]); + PADDLE_ENFORCE_NOT_NULL( + tensor, + platform::errors::PreconditionNotMet( + "Parameter [%s] is not initialized, " + "please make sure that exe.run(startup_program) has been executed.", + vec_tensor_name_list[i])); PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true, - "Paramter [%s] not initialzed " - "please make sure you have run StartUpProgram", - vec_tensor_name_list[i]); + platform::errors::PreconditionNotMet( + "Parameter [%s] is not initialized, " + "please make sure that exe.run(startup_program) has " + "been executed.", + vec_tensor_name_list[i])); PADDLE_ENFORCE_EQ( tensor->dims(), it->second->dims(), - "Shape not matching: the Program requires a parameter with a shape of " - "(%s), " - "while the loaded parameter (namely [ %s ]) has a shape of (%s).", - tensor->dims(), vec_tensor_name_list[i], it->second->dims()); + platform::errors::InvalidArgument( + "Shape does not match, the program requires a parameter with a " + "shape of " + "(%s), while the loaded parameter (namely [ %s ]) has a shape of " + "(%s).", + tensor->dims(), vec_tensor_name_list[i], it->second->dims())); TensorCopySync(*(it->second.get()), tensor->place(), tensor); @@ -239,9 +252,9 @@ bool SaveTensorToDisk(const std::string& file_name, MkDirRecursively(DirName(file_name).c_str()); std::ofstream fout(file_name, std::ios::binary); - if (!fout) { - PADDLE_THROW("File open error. Can not open file [%s]", file_name); - } + PADDLE_ENFORCE_EQ( + fout.is_open(), true, + platform::errors::Unavailable("File (%s) open failed.", file_name)); // first 256 byte for reserve for fulture upgrade char* kReserveBuffer = new char[model_file_reserve_size]; @@ -292,9 +305,8 @@ bool SaveTensorToDisk(const std::string& file_name, TensorCopySync(*tensor, platform::CPUPlace(), &temp); data_ptr = temp.data(); #else - PADDLE_THROW( - "Tensor is in CUDA device, but paddle not compile with CUDA, this " - "should not happen"); + PADDLE_THROW(platform::errors::Unavailable( + "Tensor is on CUDA device, but Paddle is not compiled with CUDA.")); #endif } fout.write(static_cast(data_ptr), @@ -302,8 +314,9 @@ bool SaveTensorToDisk(const std::string& file_name, } if (!fout) { - PADDLE_THROW("Model save failed, data write to model file [%s] error", - file_name); + PADDLE_THROW(platform::errors::Unavailable( + "Model save failed, error when writing data into model file [%s].", + file_name)); } fout.close(); @@ -316,9 +329,9 @@ bool LoadTensorFromDisk( std::map>* map_tensor) { std::ifstream fin(file_name, std::ios::binary); - if (!fin) { - PADDLE_THROW("File open error.
Can not open model file [%s]", file_name); - } + PADDLE_ENFORCE_EQ( + fin.is_open(), true, + platform::errors::Unavailable("File (%s) open failed.", file_name)); ReadReserveBuffer(fin); @@ -331,7 +344,8 @@ bool LoadTensorFromDisk( uint32_t version; fin.read(reinterpret_cast(&version), sizeof(version)); CheckInStreamState(fin, sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + PADDLE_ENFORCE_EQ(version, 0U, platform::errors::InvalidArgument( + "Only version 0 tensor is supported.")); proto::VarType::TensorDesc desc; { // int32_t size @@ -344,7 +358,7 @@ bool LoadTensorFromDisk( CheckInStreamState(fin, sizeof(size)); PADDLE_ENFORCE_EQ( desc.ParseFromArray(buf.get(), size), true, - platform::errors::InvalidArgument("Cannot parse tensor desc")); + platform::errors::InvalidArgument("Parse tensor desc failed.")); } { // read tensor diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 54a818250b45e593de4110f56e42a04a9ea65e00..1f402ea9dd33626a43a4d03b96256b2c2841c8b4 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -113,7 +113,9 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, // the 1st field, unit32_t version for SelectedRows uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + PADDLE_ENFORCE_EQ(version, 0U, + platform::errors::InvalidArgument( + "Only version 0 SelectedRows is supported.")); } { // the 2st field, rows information @@ -155,24 +157,27 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown, auto iter = id_to_index_.find(key); if (iter == id_to_index_.end()) { rwlock_->UNLock(); - if (!auto_grown) { - PADDLE_THROW("key %d not found", key); - } + PADDLE_ENFORCE_EQ( + auto_grown, true, + platform::errors::NotFound("Input key(%lld) is not found.", key)); rwlock_->WRLock(); auto map_size = id_to_index_.size(); auto vector_size = rows_.size(); if (map_size != vector_size) { rwlock_->UNLock(); - PADDLE_THROW( - "id_to_index_ size %d should have the same size with rows_ %d", - map_size, vector_size); + PADDLE_THROW(platform::errors::InvalidArgument( + "Row map size(%zu) should be equal to rows size(%zu).", map_size, + vector_size)); } auto write_iter = id_to_index_.find(key); if (write_iter == id_to_index_.end()) { int row_num = rows_.size(); if (row_num == value_->dims()[0]) { rwlock_->UNLock(); - PADDLE_THROW("selected rows is full, then length exceed %d", row_num); + PADDLE_THROW(platform::errors::InvalidArgument( + "Selected rows is full, then length exceed the length of first " + "dimension (%d).", + row_num)); } // key logic to put a key into id_to_index_ rows_.push_back(key); @@ -203,15 +208,20 @@ void SelectedRows::SyncIndex() { void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, bool auto_grown, bool is_test) { - PADDLE_ENFORCE(value->IsInitialized(), - "The value tensor should be initialized."); + PADDLE_ENFORCE_EQ(value->IsInitialized(), true, + platform::errors::InvalidArgument( + "The value tensor is not initialized.")); if (ids.numel() == 0) { VLOG(3) << "keys is empty, please check data!"; } else { int64_t value_width = value_->numel() / value_->dims()[0]; - PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], - "output tensor should have the same shape with table " - "except the dims[0]."); + PADDLE_ENFORCE_EQ( + value_width, value->numel() / value->dims()[0], + 
platform::errors::InvalidArgument( + "Output tensor should have the same shape as the table " + "except the first dimension, expected value width not counting " + "the first dimension is %d, actual value width is %d.", + value_width, value->numel() / value->dims()[0])); for (int i = 0; i < ids.numel(); ++i) { auto id = ids.data()[i]; int64_t index = AutoGrownIndex(id, auto_grown, is_test); diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 5f733139419dbc1769d9eb4efe7e793f8fb2752f..285af1d55302a49cae058fccdd5edd13aa28137e 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -82,7 +82,8 @@ class SelectedRows { int64_t Index(int64_t key) const { auto it = std::find(rows_.begin(), rows_.end(), key); if (it == rows_.end()) { - PADDLE_THROW("id %s not in table", key); + PADDLE_THROW(platform::errors::NotFound( + "Input id (%lld) is not in current rows table.", key)); } return static_cast(std::distance(rows_.begin(), it)); } diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc index 4ac872ac3d3bf918678f5294a4c35097c3fb18ab..f5bb3f68007043ad37ea32e7047c5fc546b80931 100644 --- a/paddle/fluid/framework/shape_inference.cc +++ b/paddle/fluid/framework/shape_inference.cc @@ -25,20 +25,22 @@ namespace framework { std::vector InferShapeContext::GetReaderDims( const std::string &name) const { const std::vector &arg_names = Inputs(name); - PADDLE_ENFORCE_EQ( - arg_names.size(), 1UL, - "Reader input '%s' should hold one element, but now it holds %d", name, - arg_names.size()); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + platform::errors::InvalidArgument( + "Reader input '%s' should hold one element, but now it " + "holds %d elements.", + name, arg_names.size())); return this->GetRepeatedDims(arg_names[0]); } void InferShapeContext::SetReaderDims(const std::string &name, const std::vector &dims) { const std::vector &arg_names = Outputs(name); - PADDLE_ENFORCE_EQ( - arg_names.size(), 1UL, - "Reader output '%s' should hold one element, but now it holds %d", name, - arg_names.size()); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + platform::errors::InvalidArgument( + "Reader output '%s' should hold one element, but now " + "it holds %d elements.", + name, arg_names.size())); return this->SetRepeatedDims(arg_names[0], dims); } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 544c014eaf98a99b1737809f2cbad39b46fdb276..0b22bab26789a3e2ebd20428adc236faa8b38dee 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -19,13 +19,17 @@ namespace paddle { namespace framework { extern size_t SizeOfType(proto::VarType::Type type); void Tensor::check_memory_size() const { - PADDLE_ENFORCE_NOT_NULL( - holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE_NOT_NULL(holder_, platform::errors::PreconditionNotMet( + "Tensor holds no memory. " + "Call Tensor::mutable_data firstly.")); PADDLE_ENFORCE_LE( numel() * SizeOfType(type()), memory_size(), - "Tensor's dims_ is out of bound. Call Tensor::mutable_data " - "first to re-allocate memory.\n" - "or maybe the required data-type mismatches the data already stored."); + platform::errors::PreconditionNotMet( + "Tensor's dimension is out of bound. " + "Tensor's dimension must be less than or equal to the size of its " + "memory. "
+ "But received Tensor's dimension is d%, memory's size is %d.", + numel() * SizeOfType(type()), memory_size())); } Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {} @@ -37,15 +41,21 @@ size_t Tensor::memory_size() const { void* Tensor::mutable_data(const platform::Place& place, proto::VarType::Type type, size_t requested_size) { type_ = type; - PADDLE_ENFORCE_GE(numel(), 0, - "When calling this method, the Tensor's numel must be " - "equal or larger than zero. " - "Please check Tensor::dims, or Tensor::Resize has been " - "called first. The Tensor's shape is [", - dims(), "] now"); + PADDLE_ENFORCE_GE( + numel(), 0, + platform::errors::PreconditionNotMet( + "The Tensor's element number must be equal or greater than zero. " + "The Tensor's shape is [", + dims(), "] now")); size_t size = numel() * SizeOfType(type); if (requested_size) { - PADDLE_ENFORCE_GE(requested_size, size); + PADDLE_ENFORCE_GE( + requested_size, size, + platform::errors::InvalidArgument( + "The requested memory size is less than the memory size of Tensor. " + "But received requested memory size is d%, " + "memory size of Tensor is %d.", + requested_size, size)); size = requested_size; } /* some versions of boost::variant don't have operator!= */ @@ -62,8 +72,8 @@ void* Tensor::mutable_data(const platform::Place& place, void* Tensor::mutable_data(const platform::Place& place, size_t requested_size) { - PADDLE_ENFORCE_NOT_NULL( - this->holder_, "Cannot invoke mutable data if current hold nothing."); + PADDLE_ENFORCE_NOT_NULL(this->holder_, platform::errors::PreconditionNotMet( + "The tensor is not initialized.")); return mutable_data(place, type_, requested_size); } @@ -75,12 +85,20 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) { Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const { check_memory_size(); - PADDLE_ENFORCE_GE(begin_idx, 0, - "The start row index must be greater than 0."); - PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); + PADDLE_ENFORCE_GE( + begin_idx, 0, + platform::errors::OutOfRange("The start row index must be greater than 0." + "But received the start index is d%.", + begin_idx)); + PADDLE_ENFORCE_LE( + end_idx, dims_[0], + platform::errors::OutOfRange("The end row index is out of bound.")); PADDLE_ENFORCE_LT( begin_idx, end_idx, - "The start row index must be lesser than the end row index."); + platform::errors::InvalidArgument( + "The start row index must be less than the end row index." 
+ "But received the start index = %d, the end index = %d.", + begin_idx, end_idx)); if (dims_[0] == 1) { return *this; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index d9fddc4c77d99efcb119ae37591b26bff0e51c0a..f2ccff2c133a238d02e25c65faf41dd519fdb506 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -131,13 +131,17 @@ class Tensor { const platform::Place& place() const { PADDLE_ENFORCE_NOT_NULL( - holder_, "Tensor not initialized yet when Tensor::place() is called."); + holder_, + platform::errors::PreconditionNotMet( + "Tensor not initialized yet when Tensor::place() is called.")); return holder_->place(); } proto::VarType::Type type() const { PADDLE_ENFORCE_NOT_NULL( - holder_, "Tensor not initialized yet when Tensor::type() is called."); + holder_, + platform::errors::PreconditionNotMet( + "Tensor not initialized yet when Tensor::type() is called.")); return type_; } diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index f5171b0a8d1efc33521c2d731e88f0f96bdf41c7..986551b935e8811a6b257c2b4a613a493b3b644b 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -43,9 +43,13 @@ inline T* Tensor::data() { check_memory_size(); bool valid = std::is_same::value || type_ == DataTypeTrait::DataType(); - PADDLE_ENFORCE( - valid, "Tensor holds the wrong type, it holds %s, but desires to be %s", - DataTypeToString(type_), DataTypeToString(DataTypeTrait::DataType())); + PADDLE_ENFORCE_EQ( + valid, true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type, it holds %s, but desires to be %s", + DataTypeToString(type_), + DataTypeToString(DataTypeTrait::DataType()))); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } @@ -69,9 +73,12 @@ inline T* Tensor::mutable_data(const platform::Place& place, inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { int rank = src.dims().size(); PADDLE_ENFORCE_GE( - rank, 2, - "'ReshapeToMatrix()' is only used for flatten high rank " - "tensors to matrixs. Can not be used in reshaping vectors."); + rank, 2, platform::errors::InvalidArgument( + "'ReshapeToMatrix()' is only used for flatten high rank " + "tensors to matrixs. The dimensions of Tensor must be " + "greater or equal than 2. " + "But received dimensions of Tensor is %d", + rank)); if (rank == 2) { return src; } diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 84f98d339a29b7d23de6e8be4b069b216be31ab2..cc972dd93d032c19015c86debebc27f7c8c0d155 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -41,7 +41,7 @@ TEST(Tensor, DataAssert) { std::string ex_msg = err.what(); EXPECT_TRUE(ex_msg.find("holder_ should not be null") != std::string::npos); EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call " - "Tensor::mutable_data first.") != + "Tensor::mutable_data firstly.") != std::string::npos); } ASSERT_TRUE(caught); @@ -157,7 +157,7 @@ TEST(Tensor, ShareDataWith) { EXPECT_TRUE(ex_msg.find("holder_ should not be null") != std::string::npos); EXPECT_TRUE(ex_msg.find("Tensor holds no memory. 
Call " - "Tensor::mutable_data first.") != + "Tensor::mutable_data firstly.") != std::string::npos); } ASSERT_TRUE(caught); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index c3626c5c9e0506f12ca77aac5086cb18e272a771..0e3d11b9f0257905cbede334afd0ad84ff15cb5c 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -94,9 +94,17 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx_place), true, + platform::errors::PreconditionNotMet( + "Context place error, excepted GPUPlace, but actually %s.", + ctx_place)); auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); - PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, + platform::errors::Unavailable( + "Source place and context place do not match, source " + "place is %s, context place is %s.", + src_gpu_place, ctx_gpu_place)); auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); @@ -106,9 +114,17 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx_place), true, + platform::errors::PreconditionNotMet( + "Context place error, excepted GPUPlace, but actually %s.", + ctx_place)); auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); - PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); + PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, + platform::errors::Unavailable( + "Destination place and context place do not match, " + "destination place is %s, context place is %s.", + dst_gpu_place, ctx_gpu_place)); auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); @@ -164,7 +180,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx_place), true, + platform::errors::PreconditionNotMet( + "Context place error, excepted GPUPlace, but actually %s.", + ctx_place)); auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { @@ -180,12 +200,14 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { - PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); + PADDLE_THROW(platform::errors::Unavailable( + "Context place dose not match the source and destination place.")); } } } else { // NOLINT - PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place); + PADDLE_THROW(platform::errors::Unimplemented( + "Copying from %s to %s is not supported.", src_place, dst_place)); } #endif } @@ -298,7 +320,8 @@ void 
TensorCopySync(const Tensor& src, const platform::Place& dst_place, nullptr); } else { // NOLINT - PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place); + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif } @@ -832,7 +855,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst, const platform::Place& dst_place) { // vector types not currently supported - PADDLE_ENFORCE_LE(type.lanes, 1, "vector types not currently supported"); + PADDLE_ENFORCE_LE(type.lanes, 1, + platform::errors::Unimplemented( + "Vector type is not supported currently.")); switch (type.bits) { case 8: @@ -840,32 +865,37 @@ void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst, return static_cast(dst->mutable_data(dst_place)); if (type.code == kDLUInt) return static_cast(dst->mutable_data(dst_place)); - PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.", - type.code, type.bits); + PADDLE_THROW(platform::errors::Unimplemented( + "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", + type.code, type.bits)); case 16: if (type.code == kDLInt) return static_cast(dst->mutable_data(dst_place)); if (type.code == kDLFloat) return static_cast( dst->mutable_data(dst_place)); - PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.", - type.code, type.bits); + PADDLE_THROW(platform::errors::Unimplemented( + "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", + type.code, type.bits)); case 32: if (type.code == kDLInt) return static_cast(dst->mutable_data(dst_place)); if (type.code == kDLFloat) return static_cast(dst->mutable_data(dst_place)); - PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.", - type.code, type.bits); + PADDLE_THROW(platform::errors::Unimplemented( + "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", + type.code, type.bits)); case 64: if (type.code == kDLInt) return static_cast(dst->mutable_data(dst_place)); if (type.code == kDLFloat) return static_cast(dst->mutable_data(dst_place)); - PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.", - type.code, type.bits); + PADDLE_THROW(platform::errors::Unimplemented( + "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", + type.code, type.bits)); default: - PADDLE_THROW("Unsupport type.bits %d", type.bits); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported DLDataType.bits %d.", type.bits)); } } diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index fce0142b41d3ae9b2a6fcd4f16d38b0492fbd806..a0408dbc3dbb4ffca70ef322d93b662f1b953f7b 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -183,7 +183,11 @@ void TensorToVector(const Tensor& src, std::vector* dst) { dst->resize(src.numel()); auto dst_ptr = static_cast(dst->data()); - PADDLE_ENFORCE_EQ(platform::is_cpu_place(src.place()), true); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(src.place()), true, + platform::errors::InvalidArgument( + "The input tensor should be on CPU device, but actually it is in %s.", + src.place())); memory::Copy(dst_place, dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 7f7f426d0e28224932fc96a3fefa0df1279e6475..4682bfc264b68997abd0a87233c5ed39e7e50a63 ---
a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -42,7 +42,8 @@ void ThreadPool::Init() { num_threads = FLAGS_dist_threadpool_size; VLOG(1) << "set dist_threadpool_size to " << num_threads; } - PADDLE_ENFORCE_GT(num_threads, 0); + PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument( + "The number of threads should be greater than 0.")); threadpool_.reset(new ThreadPool(num_threads)); } } @@ -83,7 +84,8 @@ void ThreadPool::TaskLoop() { } if (tasks_.empty()) { - PADDLE_THROW("This thread has no task to Run"); + PADDLE_THROW(platform::errors::Unavailable( + "Current thread has no task to Run.")); } // pop a task from the task queue diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 654d81116b280bb6a52af3f83aeec284387f3b63..09528b6fc35bf49ac3110440a62aba3200341e15 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -91,7 +91,8 @@ class ThreadPool { { std::unique_lock lock(mutex_); if (!running_) { - PADDLE_THROW("enqueue on stopped ThreadPool"); + PADDLE_THROW(platform::errors::Unavailable( + "Task is enqueued into stopped ThreadPool.")); } tasks_.push(std::move(task)); } diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index f3ea1f624ee836a483c37c2addb4d9766e87c107..2ee0b17b64b6df7a2f66b208f5b5879683db6656 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -43,8 +43,9 @@ void VarDesc::SetTensorDescNum(size_t num) { } break; default: PADDLE_THROW( - "Setting 'sub_tensor_number' is not supported by the type of var %s.", - this->Name()); + platform::errors::Unavailable("Setting 'sub_tensor_number' is not " + "supported by the %s type variable.", + this->Name())); } } @@ -55,8 +56,9 @@ size_t VarDesc::GetTensorDescNum() const { break; default: PADDLE_THROW( - "Getting 'sub_tensor_number' is not supported by the type of var %s.", - this->Name()); + platform::errors::Unavailable("Getting 'sub_tensor_number' is not " + "supported by the %s type variable.", + this->Name())); } } @@ -133,9 +135,9 @@ void VarDesc::SetLoDLevel(int32_t lod_level) { desc_.mutable_type()->mutable_tensor_array()->set_lod_level(lod_level); break; default: - PADDLE_THROW( - "Setting 'lod_level' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Setting 'lod_level' is not supported by the %s type variable.", + this->Name())); } } @@ -157,9 +159,9 @@ void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { } } break; default: - PADDLE_THROW( - "Setting 'lod_levels' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Setting 'lod_levels' is not supported by the %s type variable.", + this->Name())); } } @@ -170,9 +172,9 @@ int32_t VarDesc::GetLoDLevel() const { case proto::VarType::LOD_TENSOR_ARRAY: return desc_.type().tensor_array().lod_level(); default: - PADDLE_THROW( - "Getting 'lod_level' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'lod_level' is not supported by the %s type variable.", + this->Name())); } } @@ -187,15 +189,19 @@ std::vector VarDesc::GetLoDLevels() const { return res; break; default: - PADDLE_THROW( - "Getting 'lod_levels' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'lod_levels' is not supported by the %s type variable.", + this->Name())); } } const
proto::VarType::TensorDesc &VarDesc::tensor_desc() const { - PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); + PADDLE_ENFORCE_EQ( + desc_.type().has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); switch (desc_.type().type()) { case proto::VarType::SELECTED_ROWS: return desc_.type().selected_rows(); @@ -204,14 +210,16 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { case proto::VarType::LOD_TENSOR_ARRAY: return desc_.type().tensor_array().tensor(); default: - PADDLE_THROW( - "Getting 'tensor_desc' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'tensor_desc' is not supported by the %s type variable.", + this->Name())); } } std::vector VarDesc::tensor_descs() const { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); std::vector res; res.reserve(GetTensorDescNum()); switch (desc_.type().type()) { @@ -221,16 +229,19 @@ std::vector VarDesc::tensor_descs() const { } return res; default: - PADDLE_THROW( - "Getting 'tensor_descs' is not supported by the type of var " - "%s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'tensor_descs' is not supported by the %s type variable.", + this->Name())); } } proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); + PADDLE_ENFORCE_EQ( + desc_.type().has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); switch (desc_.type().type()) { case proto::VarType::SELECTED_ROWS: return desc_.mutable_type()->mutable_selected_rows(); @@ -240,15 +251,19 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor(); default: PADDLE_THROW( - "Getting 'mutable_tensor_desc' is not supported by the type of var " - "%s.", - this->Name()); + platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not " + "supported by the %s type variable.", + this->Name())); } } std::vector VarDesc::mutable_tensor_descs() { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); + PADDLE_ENFORCE_EQ( + desc_.type().has_type(), true, + platform::errors::NotFound("The variable's type has not been set.")); std::vector res; res.reserve(GetTensorDescNum()); switch (desc_.type().type()) { @@ -259,10 +274,9 @@ std::vector VarDesc::mutable_tensor_descs() { } return res; default: - PADDLE_THROW( - "Getting 'tensor_descs' is not supported by the type of var " - "%s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'tensor_descs' is not supported by the %s type variable.", + this->Name())); } } diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index
43e9ed553bea84aaaaa18a46fe81f06a18b124af..8affeda67b3d07d67ceed2b657b285210e1bd076 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -40,7 +40,8 @@ inline proto::VarType::Type ToVarType(int type) { case proto::VarType::READER: return static_cast(type); default: - PADDLE_THROW("ToVarType:Unsupported type %d", type); + PADDLE_THROW(platform::errors::Unavailable( + "ToVarType method does not support type %d.", type)); } } @@ -66,7 +67,8 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { visitor(var.Get()); return; default: - PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type())); + PADDLE_THROW(platform::errors::Unavailable("Unsupported visit type %s.", + ToTypeName(var.Type()))); } } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 5c90b07149ec5575f9907e41cc65a826421cf3ec..1e5e8d657556059bae8129e7c0b6ea6b57cbf63f 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -46,12 +46,14 @@ struct VarIdToTypeIndexMapInitializerImpl { static_assert(!std::is_same::value, "Type cannot be void"); constexpr int kId = VarTypeTrait::kId; auto type = std::type_index(typeid(Type)); - PADDLE_ENFORCE(id_to_type->count(kId) == 0, - "Registered duplicate type id %d for type %s", kId, - type.name()); - PADDLE_ENFORCE(type_to_id->count(type) == 0, - "Registered duplicate type_index %s for id %d", type.name(), - kId); + PADDLE_ENFORCE_EQ( + id_to_type->count(kId), 0, + platform::errors::AlreadyExists( + "Registered duplicate type id %d for type %s.", kId, type.name())); + PADDLE_ENFORCE_EQ( + type_to_id->count(type), 0, + platform::errors::AlreadyExists( + "Registered duplicate type index %s for id %d.", type.name(), kId)); id_to_type->emplace(kId, type); type_to_id->emplace(type, kId); VarIdToTypeIndexMapInitializerImplsecond; } static int ToTypeId(const std::type_index &type) { auto it = Instance().type_to_id_map_.find(type); - PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(), - "VarType %s is not registered.", type.name()); + PADDLE_ENFORCE_NE(it, Instance().type_to_id_map_.end(), + platform::errors::NotFound( + "Variable Type %s is not registered.", type.name())); return it->second; } diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index 67e17410a29aff435921f46eeb2691a025d5a9eb..ec42aa30e5abb3dc3d03633cae31d95999d82731 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -50,11 +50,11 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::RAW) { // GetMutable will be called in operator } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::Unavailable( "Variable type %d is not in " "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", - var_type); + "LOD_RANK_TABLE, PLACE_LIST, READER, RAW].", + var_type)); } } @@ -76,7 +76,8 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) { auto *dst_t = tmp_grad_slr->mutable_value(); framework::TensorCopy(src_t, cpu_place, dst_t); } else { - PADDLE_THROW("unknown var type to copy"); + PADDLE_THROW( + platform::errors::Unavailable("Unknown variable type to copy.")); } } diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index d6d0371edaa78cde603a7f7d77473682be57df31..be7d6ab868022b5e9e1f073aad441decba0dbf00
100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -27,8 +27,9 @@ Analyzer::Analyzer() {} void Analyzer::Run(Argument *argument) { RunAnalysis(argument); } void Analyzer::RunAnalysis(Argument *argument) { - PADDLE_ENFORCE(argument->analysis_passes_valid(), - "analsis_passes is not valid in the argument."); + PADDLE_ENFORCE_EQ(argument->analysis_passes_valid(), true, + platform::errors::InvalidArgument( + "analysis_passes is not valid in the argument.")); const bool disable_logs = argument->disable_logs(); for (auto &pass : argument->analysis_passes()) { if (!disable_logs) { @@ -38,7 +39,8 @@ void Analyzer::RunAnalysis(Argument *argument) { continue; auto *ptr = PassRegistry::Global().Retreive(pass); - PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass); + PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::PreconditionNotMet( + "No analysis pass called %s.", pass)); ptr->Run(argument); } } diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 79784fcb9bf31e8fac972053b1a4ec6180d45afa..135ef6a970621cea6ee1418f751ffc37406628db 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -75,9 +75,14 @@ void TestWord2vecPrediction(const std::string& model_path) { std::vector outputs; CHECK(predictor->Run(slots, &outputs)); - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); + PADDLE_ENFORCE_EQ(outputs.size(), 1UL, + platform::errors::PreconditionNotMet( + "Output size should be 1, but got %d.", outputs.size())); // Check the output buffer size and result of each tid. - PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL); + PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL, + platform::errors::PreconditionNotMet( + "Output's data length should be 33168, but got %d.", + outputs.front().data.length())); float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 27bae7a71ea192ac08e4e87cb7bcdb8b84e29dc8..40ca3e85868fbbba19d81336aed1a8cbee58ec54 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -76,53 +76,62 @@ struct Argument { } } -#define DECL_ARGUMENT_FIELD(field__, Field, type__) \ - public: \ - type__& field__() { \ - PADDLE_ENFORCE(Has(#field__), "There is no such field"); \ - return field__##_; \ - } \ - void Set##Field(const type__& x) { \ - field__##_ = x; \ - valid_fields_.insert(#field__); \ - } \ - DECL_ARGUMENT_FIELD_VALID(field__); \ - type__* field__##_ptr() { return &field__##_; } \ - \ - private: \ +#define DECL_ARGUMENT_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE_EQ( \ + Has(#field__), true, \ + platform::errors::PreconditionNotMet("There is no such field")); \ + return field__##_; \ + } \ + void Set##Field(const type__& x) { \ + field__##_ = x; \ + valid_fields_.insert(#field__); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { return &field__##_; } \ + \ + private: \ type__ field__##_; #define DECL_ARGUMENT_FIELD_VALID(field__) \ bool field__##_valid() { return Has(#field__); } -#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \ - public: \ - type__& field__() { \ - PADDLE_ENFORCE_NOT_NULL(field__##_); \ - PADDLE_ENFORCE(Has(#field__)); \ - return
*static_cast(field__##_.get()); \ } \ void Set##Field(type__* x) { \ - field__##_ = \ - unique_ptr_t(x, [](void* x) { delete static_cast(x); }); \ - valid_fields_.insert(#field__); \ - } \ - void Set##Field##NotOwned(type__* x) { \ - valid_fields_.insert(#field__); \ - field__##_ = unique_ptr_t(x, [](void* x) {}); \ - } \ - DECL_ARGUMENT_FIELD_VALID(field__); \ - type__* field__##_ptr() { \ - PADDLE_ENFORCE(Has(#field__)); \ - return static_cast(field__##_.get()); \ - } \ - type__* Release##Field() { \ - PADDLE_ENFORCE(Has(#field__)); \ - valid_fields_.erase(#field__); \ - return static_cast(field__##_.release()); \ - } \ - \ - private: \ +#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE_NOT_NULL(field__##_, platform::errors::PreconditionNotMet( \ + "field should not be null.")); \ + PADDLE_ENFORCE_EQ( \ + Has(#field__), true, \ + platform::errors::PreconditionNotMet("There is no such field")); \ + return *static_cast(field__##_.get()); \ } \ + void Set##Field(type__* x) { \ + field__##_ = \ + unique_ptr_t(x, [](void* x) { delete static_cast(x); }); \ + valid_fields_.insert(#field__); \ + } \ + void Set##Field##NotOwned(type__* x) { \ + valid_fields_.insert(#field__); \ + field__##_ = unique_ptr_t(x, [](void* x) {}); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { \ + PADDLE_ENFORCE_EQ( \ + Has(#field__), true, \ + platform::errors::PreconditionNotMet("There is no such field")); \ + return static_cast(field__##_.get()); \ } \ + type__* Release##Field() { \ + PADDLE_ENFORCE_EQ( \ + Has(#field__), true, \ + platform::errors::PreconditionNotMet("There is no such field")); \ + valid_fields_.erase(#field__); \ + return static_cast(field__##_.release()); \ } \ + \ + private: \ unique_ptr_t field__##_; DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int); @@ -218,13 +227,19 @@ struct Argument { DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t); + // Only used in paddle-lite subgraph.
+ DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads, + int); + private: std::unordered_set valid_fields_; }; #define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \ - PADDLE_ENFORCE(argument__->Has(#fieldname__), \ - "the argument field [%s] should be set", #fieldname__); + PADDLE_ENFORCE_EQ( \ + argument__->Has(#fieldname__), true, \ + platform::errors::PreconditionNotMet( \ + "The argument field [%s] should be set.", #fieldname__)); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index a48058400241b030f17557156a4d973fca92fd8d..730fe35853a96a3427c26f1fa5662118a638f731 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -73,12 +73,15 @@ struct DataTypeNamer { template const std::string &repr() const { auto x = std::type_index(typeid(T)); - PADDLE_ENFORCE(dic_.count(x), "unknown type for representation"); + PADDLE_ENFORCE_GT(dic_.count(x), 0, platform::errors::PreconditionNotMet( + "Unknown type for representation.")); return dic_.at(x); } const std::string &repr(const std::type_index &type) const { // NOLINT - PADDLE_ENFORCE(dic_.count(type), "unknown type for representation"); + PADDLE_ENFORCE_GT(dic_.count(type), 0, + platform::errors::PreconditionNotMet( + "Unknown type for representation.")); return dic_.at(type); } @@ -116,7 +119,9 @@ template class OrderedRegistry { public: T *Register(const std::string &name, T *x) { - PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name); + PADDLE_ENFORCE_EQ(dic_.count(name), 0, + platform::errors::PreconditionNotMet( + "There exists a duplicate key [%s].", name)); dic_[name] = elements_.size(); elements_.emplace_back(std::unique_ptr(x)); return elements_.back().get(); } @@ -136,14 +141,20 @@ class OrderedRegistry { template T &GetFromScope(const framework::Scope &scope, const std::string &name) { framework::Variable *var = scope.FindVar(name); - PADDLE_ENFORCE(var != nullptr); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::PreconditionNotMet( + "The variable named %s should not be nullptr.", name)); return *var->GetMutable(); } static framework::proto::ProgramDesc LoadProgramDesc( const std::string &model_path) { std::ifstream fin(model_path, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path); + PADDLE_ENFORCE_EQ( + fin.is_open(), true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file exists.", + model_path)); fin.seekg(0, std::ios::end); std::string buffer(fin.tellg(), ' '); fin.seekg(0, std::ios::beg); @@ -188,10 +199,12 @@ static std::string GetDirRoot(const std::string &path) { static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) { std::string opt_cache_dir = model_root + "/_opt_cache/"; if (!PathExists(opt_cache_dir)) { - PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1, - "Can not create optimize cache directory: %s, Make sure you " - "have permission to write", - opt_cache_dir); + PADDLE_ENFORCE_NE( + MKDIR(opt_cache_dir.c_str()), -1, + platform::errors::PreconditionNotMet( + "Can not create optimize cache directory: %s. Make sure you " + "have permission to write.", + opt_cache_dir)); } return opt_cache_dir; } diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index cd8d86d72938417112e17e86e5cc6dd12254a8d1..d136f5033e7e3783ec6c44bbacb94047c718b935 100644 ---
a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -38,7 +38,9 @@ IRPassManager::IRPassManager(Argument *argument) { graph_ = std::unique_ptr(new Graph(argument->main_program())); if (argument->Has("scope")) { auto *scope_ptr = argument->scope_ptr(); - PADDLE_ENFORCE(scope_ptr); + PADDLE_ENFORCE_NOT_NULL(scope_ptr, + platform::errors::PreconditionNotMet( + "The scope ptr should not be nullptr.")); graph_->SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); } @@ -101,13 +103,17 @@ void IRPassManager::CreatePasses(Argument *argument, std::string optim_cache_dir = argument->optim_cache_dir(); bool int8_valid = !(model_from_memory && optim_cache_dir.empty() && enable_int8); - PADDLE_ENFORCE(int8_valid, - "When you are in TRT INT8 mode, and load model from " - "memory, you should set optim_cache_dir using " - "config.SetOptimCacheDir()"); - PADDLE_ENFORCE(!(model_from_memory && use_static_engine), - "When you are using Paddle-TRT, and also using load model " - "from memory, you should set the use_static to false."); + PADDLE_ENFORCE_EQ( + int8_valid, true, + platform::errors::PreconditionNotMet( + "When you are in TRT INT8 mode and load the model from " + "memory, you should set optim_cache_dir using " + "config.SetOptimCacheDir().")); + PADDLE_ENFORCE_EQ( + !(model_from_memory && use_static_engine), true, + platform::errors::PreconditionNotMet( + "When you are using Paddle-TRT and also loading the model " + "from memory, you should set use_static to false.")); if (!optim_cache_dir.empty()) { pass->Set("model_opt_cache_dir", new std::string(optim_cache_dir)); @@ -150,6 +156,8 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("use_xpu", new bool(argument->use_xpu())); pass->Set("xpu_l3_workspace_size", new int(argument->xpu_l3_workspace_size())); + pass->Set("cpu_math_library_num_threads", + new int(argument->cpu_math_library_num_threads())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index 6b16a481ddedbad0956d1358de95842ea9a3a101..e78d5ef017b7f8451556d388bf3b8c0a55276a59 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -244,6 +244,7 @@ void LiteSubgraphPass::SetUpEngine( bool enable_int8 = Get("enable_int8"); bool use_xpu = Get("use_xpu"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); + int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); lite_api::TargetType target_type; if (use_gpu) {
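The new cpu_math_library_num_threads option travels through three layers before it reaches the Lite engine. Pieced together from hunks in this patch, the chain looks like this (a condensed sketch, not a verbatim hunk):

// AnalysisPredictor::PrepareArgument() copies the user setting into Argument:
argument_.SetCpuMathLibraryNumThreads(config_.cpu_math_library_num_threads());
// IRPassManager::CreatePasses() forwards it to the lite subgraph pass:
pass->Set("cpu_math_library_num_threads",
          new int(argument->cpu_math_library_num_threads()));
// LiteSubgraphPass::SetUpEngine() reads it back and hands it to Lite:
int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
config.cpu_math_library_num_threads = cpu_math_library_num_threads;
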
@@ -263,11 +264,12 @@ void LiteSubgraphPass::SetUpEngine( // Notice: The ordering here determines the device where the // input tensor of the Lite engine is located, and then affects // whether tensor sharing is feasible. - paddle::lite::Place({target_type, precision_type}), - paddle::lite::Place({target_type, PRECISION(kInt64)}), - paddle::lite::Place({target_type, PRECISION(kFloat)}), - paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}), + paddle::lite_api::Place({target_type, precision_type}), + paddle::lite_api::Place({target_type, PRECISION(kInt64)}), + paddle::lite_api::Place({target_type, PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}), }; + config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index b3bfafb0a116018fe2d624f390f355b348e3f847..ebb19fd486cc89c69d70de3fa98954b9ee415f1a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -123,7 +123,9 @@ void RenameAndGetOutputs( auto add_block_var = [&](const std::string &graph_arg, const std::string &block_arg) { auto arg_var_node = graph_var_map.find(graph_arg); - PADDLE_ENFORCE(arg_var_node != graph_var_map.end()); + PADDLE_ENFORCE_NE(arg_var_node, graph_var_map.end(), + platform::errors::InvalidArgument( + "Cannot find %s in graph_var_map", graph_arg)); auto *var_t = block_desc->Var(block_arg); var_t->SetShape(arg_var_node->second->Var()->GetShape()); var_t->SetDataType(arg_var_node->second->Var()->GetDataType()); @@ -133,7 +135,10 @@ void RenameAndGetOutputs( framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); framework::OpDesc op_desc(*op, nullptr); auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); + PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type(), + platform::errors::PreconditionNotMet( + "Expected op %s, but got %s", op->type(), + correspond_node->Name())); std::unordered_map var2id; std::unordered_map in_vars; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 7ef072277fb7f1f13c14b38d64cea6d1f4584b76..46612c1c5b7065a1f87e09117818df8a15e2bd8b 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -97,7 +97,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp( std::vector *repetitive_params) const { auto *op_desc = node->Op(); auto &subgraph = *framework::ir::Agent(node).subgraph(); - PADDLE_ENFORCE(!subgraph.empty()); + PADDLE_ENFORCE_EQ(subgraph.empty(), false, + platform::errors::PreconditionNotMet( + "The subgraph should not be empty.")); framework::ProgramDesc *program_desc = Get("program"); @@ -194,12 +196,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // to Tensor.
std::vector output_mapping; for (auto name : output_names) { - PADDLE_ENFORCE(output_name_map.count(name) != 0); + PADDLE_ENFORCE_NE(output_name_map.count(name), 0, + platform::errors::PreconditionNotMet( + "The output_name_map should have %s", name)); output_mapping.push_back(output_name_map[name]); } - PADDLE_ENFORCE(!output_mapping.empty()); - PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), - "the block has no var-desc"); + PADDLE_ENFORCE_EQ(output_mapping.empty(), false, + platform::errors::PreconditionNotMet( + "The output_mapping should not be empty.")); + PADDLE_ENFORCE_EQ( + !block_desc.Proto()->vars().empty(), true, + platform::errors::PreconditionNotMet("The block has no var-desc.")); // Set attrs op_desc->SetType("tensorrt_engine"); diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc index d986811a827b6ed477b30bc43d26f52a71e8f178..34192965297a6b88c7905a2b1d7b1857d842f06a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include +#include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h" @@ -31,7 +33,10 @@ void IrAnalysisPass::RunImpl(Argument* argument) { // Apply passes. IRPassManager the_ir_manager(argument); graph = the_ir_manager.Apply(std::move(graph)); - PADDLE_ENFORCE_GT(graph->Nodes().size(), 0); + PADDLE_ENFORCE_GT( + graph->Nodes().size(), 0, + platform::errors::PreconditionNotMet( + "The graph nodes size should be greater than 0, but got 0")); argument->SetMainGraph(graph.release()); CollectFusionStatis(argument); } diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 970ecdbbeb0c4c12ce6ba928a74a14ca1ae183ca..188b2ff851d96fa76edd666c696d98ddb1dcb948 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -31,7 +31,9 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { if (!argument->scope_valid()) { argument->SetScope(new framework::Scope); } - PADDLE_ENFORCE(argument->use_gpu_valid()); + PADDLE_ENFORCE_EQ(argument->use_gpu_valid(), true, + platform::errors::PreconditionNotMet( + "The use_gpu field should be valid")); // The load program should run on the same device with the inference program, // so that the parameters will on the same device, or they will keep copying @@ -51,14 +53,17 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { argument->model_from_memory_valid() && argument->model_from_memory()); argument->SetMainProgram(program.release()); } else { - PADDLE_THROW( - "either model_dir or (program path and parameter path) should be set."); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Either model_dir or (program path and parameter path) should be " + "set.")); } auto graph = std::unique_ptr(new Graph(argument->main_program())); argument->SetMainGraph(graph.release()); auto *scope_ptr = argument->scope_ptr(); - PADDLE_ENFORCE(scope_ptr); + PADDLE_ENFORCE_NOT_NULL(scope_ptr, + platform::errors::PreconditionNotMet( + "The scope ptr should not be nullptr.")); argument->main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); } diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc 
b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc index 1f888a28da0416b41a87b551208fbe109f54d844..c30aa2a1629c3638b1e7714f7d646c924e7156d7 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc @@ -31,7 +31,8 @@ void IrInferCleanGraphPass::RunImpl(Argument* argument) { std::unordered_set invalid_nodes; int valid_op = 0; for (auto* node : graph.Nodes()) { - PADDLE_ENFORCE_NOT_NULL(node); + PADDLE_ENFORCE_NOT_NULL(node, platform::errors::PreconditionNotMet( + "The node should not be nullptr.")); if (is_valid_node(node)) { invalid_nodes.insert(node); } else if (node->IsOp()) { diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index fedee3ff95f0ffe7af730c7113dbe6ea33c118e5..f127478b5f2bf4bbc3157c3d825d9b042275d957 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -23,8 +23,12 @@ namespace inference { namespace analysis { void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { - PADDLE_ENFORCE(argument->scope_valid()); - PADDLE_ENFORCE(argument->use_gpu_valid()); + PADDLE_ENFORCE_EQ( + argument->scope_valid(), true, + platform::errors::PreconditionNotMet("The scope field should be valid")); + PADDLE_ENFORCE_EQ(argument->use_gpu_valid(), true, + platform::errors::PreconditionNotMet( + "The use_gpu field should be valid")); platform::Place place; @@ -40,7 +44,9 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { LOG(INFO) << "Sync params from CPU to GPU"; - PADDLE_ENFORCE(argument->gpu_device_id_valid()); + PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(), true, + platform::errors::PreconditionNotMet( + "The gpu_device_id field should be valid")); place = platform::CUDAPlace(argument->gpu_device_id()); auto *scope = argument->scope_ptr(); @@ -56,7 +62,8 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { continue; } auto *var = scope->FindLocalVar(var_name); - PADDLE_ENFORCE(var != nullptr); + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( + "The var should not be nullptr")); if (var->IsType() || var->IsType()) { auto *t = var->GetMutable(); diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 9eb8478515727cf04f9d16e9a38a8f4c3ec9c683..f432188131eddc402e696091ab3723697216aadf 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -224,7 +224,9 @@ void UpdateOpDescsByReuse( // modify the graph for (auto input_node : node->inputs) { - PADDLE_ENFORCE(input_node->IsVar()); + PADDLE_ENFORCE_EQ(input_node->IsVar(), true, + platform::errors::PreconditionNotMet( + "The input node should be a variable.")); std::string input_node_name = input_node->Name(); if (reuse_table.count(input_node_name) && reuse_table.at(input_node_name) != input_node_name) { @@ -246,7 +248,9 @@ void UpdateOpDescsByReuse( // modify the graph for (auto out_node : node->outputs) { - PADDLE_ENFORCE(out_node->IsVar()); + PADDLE_ENFORCE_EQ(out_node->IsVar(), true, + platform::errors::PreconditionNotMet( + "The output node should be a variable.")); std::string out_node_name = out_node->Name(); if (reuse_table.count(out_node_name) && 
reuse_table.at(out_node_name) != out_node_name) { diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index fb0ad31a3e612201de32813a65970c73b73b611b..c0d3b14e0e43e10332d18ddd217a8a50245ab5ed 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -53,12 +53,10 @@ if(WITH_TESTING) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_fluid_shared ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) - set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST") elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) - set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST") endif() endif() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 9fbc97d55090345af3b3b12bcd138bfaecd346cc..2184574aa1fe3c66728b41f221c1b0bf5fd464e7 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -230,7 +230,8 @@ void AnalysisConfig::EnableMkldnnBfloat16() { MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const { PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_, - "MkldnnQuantizer was not enabled yet."); + platform::errors::PreconditionNotMet( + "MkldnnQuantizer was not enabled yet.")); return mkldnn_quantizer_config_.get(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 500aa8341d6a61056f6f80f82c6f28bb569eb772..ac914700643af2e7e8eca5dcf0bdf8de88e320d6 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -169,7 +169,8 @@ bool AnalysisPredictor::PrepareScope( if (parent_scope) { PADDLE_ENFORCE_NOT_NULL( parent_scope, - "Both program and parent_scope should be set in Clone mode."); + platform::errors::PreconditionNotMet( + "Both program and parent_scope should be set in Clone mode.")); scope_ = parent_scope; status_is_cloned_ = true; } else { @@ -235,7 +236,9 @@ bool AnalysisPredictor::PrepareExecutor() { executor_->Prepare(sub_scope_, *inference_program_, 0, config_.use_feed_fetch_ops_); - PADDLE_ENFORCE_NOT_NULL(sub_scope_); + PADDLE_ENFORCE_NOT_NULL(sub_scope_, + platform::errors::PreconditionNotMet( + "The sub_scope should not be nullptr.")); return true; } @@ -297,7 +300,8 @@ bool AnalysisPredictor::Run(const std::vector &inputs, timer.tic(); // set feed variable framework::Scope *scope = sub_scope_ ? 
sub_scope_ : scope_.get(); - PADDLE_ENFORCE_NOT_NULL(scope, "The scope should not be nullptr."); + PADDLE_ENFORCE_NOT_NULL(scope, platform::errors::PreconditionNotMet( + "The scope should not be nullptr.")); if (!SetFeed(inputs, scope)) { LOG(ERROR) << "fail to set feed"; return false; @@ -399,7 +403,11 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, outputs->resize(fetches_.size()); for (size_t i = 0; i < fetches_.size(); ++i) { int idx = BOOST_GET_CONST(int, fetches_[i]->GetAttr("col")); - PADDLE_ENFORCE((size_t)idx == i); + PADDLE_ENFORCE_EQ( + static_cast(idx), i, + platform::errors::InvalidArgument( + "Fetch op's col attr(%d) should be equal to the index(%d)", idx, + i)); framework::FetchType &fetch_var = framework::GetFetchVariable(*scope, "fetch", idx); auto &fetch = BOOST_GET(framework::LoDTensor, fetch_var); @@ -435,10 +443,12 @@ void AnalysisPredictor::PrepareArgument() { if (!config_.model_dir().empty()) { argument_.SetModelDir(config_.model_dir()); } else { - PADDLE_ENFORCE( - !config_.params_file().empty(), - "Either model_dir or (param_file, prog_file) should be set."); - PADDLE_ENFORCE(!config_.prog_file().empty()); + PADDLE_ENFORCE_EQ(config_.params_file().empty(), false, + platform::errors::PreconditionNotMet( + "Either model_dir or param_file should be set.")); + PADDLE_ENFORCE_EQ(config_.prog_file().empty(), false, + platform::errors::PreconditionNotMet( + "Either model_dir or prog_file should be set.")); std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); argument_.SetModelProgramPath(config_.prog_file()); @@ -461,6 +471,8 @@ void AnalysisPredictor::PrepareArgument() { } if (config_.lite_engine_enabled()) { + argument_.SetCpuMathLibraryNumThreads( + config_.cpu_math_library_num_threads()); argument_.SetLitePrecisionMode(config_.lite_precision_mode_); argument_.SetLitePassesFilter(config_.lite_passes_filter_); argument_.SetLiteOpsFilter(config_.lite_ops_filter_); @@ -501,7 +513,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { PrepareArgument(); Analyzer().Run(&argument_); - PADDLE_ENFORCE(argument_.scope_valid()); + PADDLE_ENFORCE_EQ( + argument_.scope_valid(), true, + platform::errors::InvalidArgument("The argument scope should be valid.")); VLOG(5) << "to prepare executor"; ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program); inference_program_.reset( @@ -523,8 +537,10 @@ std::unique_ptr CreatePaddlePredictor< FLAGS_minloglevel = 2; // GLOG_ERROR } VLOG(3) << "create AnalysisConfig"; - PADDLE_ENFORCE(config.is_valid(), - "Note: Each config can only be used for one predictor."); + PADDLE_ENFORCE_EQ( + config.is_valid(), true, + platform::errors::InvalidArgument( + "Note: Each config can only be used for one predictor.")); if (config.use_gpu()) { static std::once_flag gflags_initialized; @@ -621,7 +637,9 @@ bool AnalysisPredictor::MkldnnQuantize() { } void AnalysisPredictor::PrepareFeedFetch() { - PADDLE_ENFORCE_NOT_NULL(sub_scope_); + PADDLE_ENFORCE_NOT_NULL(sub_scope_, + platform::errors::InvalidArgument( + "The sub_scope should not be nullptr.")); CreateFeedFetchVar(sub_scope_); for (auto *op : inference_program_->Block(0).AllOps()) { if (op->Type() == "feed") { @@ -644,7 +662,8 @@ void AnalysisPredictor::PrepareFeedFetch() { } void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { - PADDLE_ENFORCE_NOT_NULL(scope); + PADDLE_ENFORCE_NOT_NULL(scope, platform::errors::InvalidArgument( + "The scope should not be nullptr.")); auto *var = scope->Var("feed"); var->GetMutable(); var = scope->Var("fetch"); @@ -665,7 
+684,8 @@ AnalysisPredictor::GetInputTensorShape() { std::vector names = GetInputNames(); for (std::string name : names) { auto *var = inference_program_->Block(0).FindVar(name); - PADDLE_ENFORCE_NOT_NULL(var, "input %s does not exist.", name); + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( + "Input %s does not exist.", name)); input_shapes[name] = var->GetShape(); } return input_shapes; @@ -681,7 +701,11 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { - PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + PADDLE_ENFORCE_NOT_NULL( + executor_->scope()->FindVar(name), + platform::errors::PreconditionNotMet( + "The variable named %s is not found in the scope of the executor.", + name)); std::unique_ptr res( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = true; @@ -698,7 +722,11 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { - PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + PADDLE_ENFORCE_NOT_NULL( + executor_->scope()->FindVar(name), + platform::errors::PreconditionNotMet( + "The variable named %s is not found in the scope of the executor.", + name)); std::unique_ptr res( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = false; @@ -759,8 +787,11 @@ bool AnalysisPredictor::LoadProgramDesc() { std::string pb_content; // Read binary std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin.is_open()), "Cannot open file %s", - filename); + PADDLE_ENFORCE_EQ( + static_cast(fin.is_open()), true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file exists and is valid.", + filename)); fin.seekg(0, std::ios::end); pb_content.resize(fin.tellg()); fin.seekg(0, std::ios::beg); @@ -777,7 +808,8 @@ bool AnalysisPredictor::LoadProgramDesc() { bool AnalysisPredictor::LoadParameters() { PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), - "The inference program should be loaded first."); + platform::errors::PreconditionNotMet( + "The inference program should be loaded first.")); const auto &global_block = inference_program_->MutableBlock(0); @@ -853,8 +885,9 @@ void AnalysisPredictor::ClearIntermediateTensor() { #if PADDLE_WITH_TENSORRT bool AnalysisPredictor::SaveTrtCalibToDisk() { - PADDLE_ENFORCE(config_.tensorrt_engine_enabled(), - "This func can be invoked only in trt mode"); + PADDLE_ENFORCE_EQ(config_.tensorrt_engine_enabled(), true, + platform::errors::PreconditionNotMet( + "This function can be invoked only in TRT mode")); auto &block = inference_program_->Block(0); for (auto &op_desc : block.AllOps()) { if (op_desc->Type() == "tensorrt_engine") { diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 2f608da531f25e1a5665744f7e9a2968cc9d0d64..840541246aff4d6f5dec1d8b3f8e5892bdcb6e9d 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -62,9 +62,9 @@ PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) { if (other.length() && other.data()) memcpy(data_, other.data(), other.length()); else if (other.length()) - PADDLE_THROW( + PADDLE_THROW(platform::errors::InvalidArgument( "Invalid argument, null pointer data with length %u is passed", - other.length()); + other.length())); length_ = other.length(); memory_owned_ = true;
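PaddleBuf's copy, resize, and free checks all hinge on its memory_owned_ flag; a short sketch of the two ownership modes (illustrative values, not lines from this diff):

// Owned: the buffer allocates its storage and may be reallocated or freed.
paddle::PaddleBuf owned(1024);               // allocates 1024 bytes
owned.Resize(2048);                          // fine: owned memory can grow
// Not owned: the buffer wraps caller memory and must not resize it.
float data[256];
paddle::PaddleBuf view(data, sizeof(data));  // memory_owned_ stays false
// view.Resize(4096) would hit the PreconditionNotMet throw in Resize below.
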
@@ -92,7 +92,8 @@ void PaddleBuf::Resize(size_t length) { length_ = length; memory_owned_ = true; } else { - PADDLE_THROW("The memory is allocated externally, can not Resized"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "The memory is allocated externally, cannot be resized")); } } @@ -105,7 +106,11 @@ void PaddleBuf::Reset(void *data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { - PADDLE_ENFORCE_GT(length_, 0UL); + PADDLE_ENFORCE_GT( + length_, 0UL, + platform::errors::PreconditionNotMet( + "The memory size used in PaddleBuf (%d) should be greater than 0", + length_)); delete[] static_cast(data_); data_ = nullptr; length_ = 0; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 07d6dcf86e9814e5bfc932d8320b549d55fe88ae..ca0a5148f0622a8c848cb18afb94f600a547bbfe 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -87,7 +87,9 @@ bool NativePaddlePredictor::Init( if (parent_scope) { scope_ = parent_scope; sub_scope_ = &(parent_scope->NewScope()); - PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail"); + PADDLE_ENFORCE_NOT_NULL(sub_scope_, + platform::errors::PreconditionNotMet( + "The sub_scope should not be nullptr.")); } else { paddle::framework::InitDevices(false); scope_.reset(new paddle::framework::Scope()); @@ -182,7 +184,10 @@ std::unique_ptr NativePaddlePredictor::Clone() { std::unique_ptr cls(new NativePaddlePredictor(config_)); // Hot fix the bug that result diff in multi-thread. // TODO(Superjomn) re-implement a real clone here. - PADDLE_ENFORCE_NOT_NULL(dynamic_cast(cls.get())); + PADDLE_ENFORCE_NOT_NULL( + dynamic_cast(cls.get()), + platform::errors::PreconditionNotMet( + "dynamic_cast from PaddlePredictor to NativePaddlePredictor failed")); if (!dynamic_cast(cls.get())->Init(nullptr)) { LOG(ERROR) << "fail to call Init"; return nullptr; @@ -224,8 +229,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, return false; } - PADDLE_ENFORCE_NOT_NULL(input_ptr); - PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data()); + PADDLE_ENFORCE_NOT_NULL(input_ptr, + platform::errors::InvalidArgument( + "The input_ptr should not be nullptr.")); + PADDLE_ENFORCE_NOT_NULL( + inputs[i].data.data(), + platform::errors::InvalidArgument( + "The data of input tensor should not be null.")); if (platform::is_cpu_place(place_)) { // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), inputs[i].data.data(), @@ -241,7 +251,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, platform::CPUPlace(), inputs[i].data.data(), inputs[i].data.length(), dev_ctx->stream()); #else - PADDLE_THROW("Not compile with CUDA, should not reach here."); + PADDLE_THROW(platform::errors::Unavailable( + "Not compiled with CUDA, should not reach here.")); #endif } @@ -287,7 +298,11 @@ bool NativePaddlePredictor::GetFetch(std::vector *outputs, outputs->resize(fetchs_.size()); for (size_t i = 0; i < fetchs_.size(); ++i) { int idx = BOOST_GET_CONST(int, fetchs_[i]->GetAttr("col")); - PADDLE_ENFORCE((size_t)idx == i); + PADDLE_ENFORCE_EQ( + static_cast(idx), i, + platform::errors::InvalidArgument( + "Fetch op's col attr(%d) should be equal to the index(%d)", idx, + i)); framework::FetchType &fetch_var = framework::GetFetchVariable(*scope, "fetch", idx); auto fetch = BOOST_GET_CONST(framework::LoDTensor, fetch_var); @@ -318,10 +333,15 @@ std::unique_ptr CreatePaddlePredictor< VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { // 1. 
GPU memory - PADDLE_ENFORCE_GE( - config.fraction_of_gpu_memory, 0.f, - "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); - PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + PADDLE_ENFORCE_GE(config.fraction_of_gpu_memory, 0.f, + platform::errors::InvalidArgument( + "fraction_of_gpu_memory in the config should be set " + "to range (0., 1.]")); + PADDLE_ENFORCE_GE(config.device, 0, + platform::errors::PreconditionNotMet( + "Invalid device id %d, the device id should be " + "greater than or equal to 0.", + config.device)); std::vector flags; if (config.fraction_of_gpu_memory >= 0.0f || config.fraction_of_gpu_memory <= 0.95f) { @@ -336,7 +356,9 @@ std::unique_ptr CreatePaddlePredictor< std::unique_ptr predictor(new NativePaddlePredictor(config)); PADDLE_ENFORCE_NOT_NULL( - dynamic_cast(predictor.get())); + dynamic_cast(predictor.get()), + platform::errors::PreconditionNotMet( + "dynamic_cast from PaddlePredictor to NativePaddlePredictor failed")); if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index a5a0405ac88ad8e94a65d728557ab9298eae56dc..46755eeda660ae8f4c54d318f6450fbf1d48b1f7 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -21,15 +21,21 @@ namespace paddle { void ZeroCopyTensor::Reshape(const std::vector &shape) { - PADDLE_ENFORCE(!name_.empty(), - "Need to SetName first, so that the corresponding tensor can " - "be retrieved."); - PADDLE_ENFORCE(input_or_output_, - "Can't reshape the output tensor, it is readonly"); - PADDLE_ENFORCE(scope_); + PADDLE_ENFORCE_EQ( + name_.empty(), false, + platform::errors::PreconditionNotMet( + "Need to SetName first, so that the corresponding tensor can " + "be retrieved.")); + PADDLE_ENFORCE_EQ(input_or_output_, true, + platform::errors::PermissionDenied( + "Can't reshape the output tensor, it is readonly")); + PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet( + "The scope should not be nullptr.")); auto *scope = static_cast(scope_); auto *var = scope->FindVar(name_); - PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); auto *tensor = var->GetMutable(); tensor->Resize(framework::make_ddim(shape)); } @@ -45,8 +51,10 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) { EAGER_GET_TENSOR; PADDLE_ENFORCE_GT( tensor->numel(), 0, - "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" - "function before retrieving mutable_data from input tensor."); + platform::errors::PreconditionNotMet( + "You should call ZeroCopyTensor::Reshape(const std::vector " + "&shape) " + "function before retrieving mutable_data from input tensor.")); switch (static_cast(place)) { case static_cast(PaddlePlace::kCPU): { return tensor->mutable_data(platform::CPUPlace()); @@ -55,7 +63,8 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) { return tensor->mutable_data(platform::CUDAPlace(device_)); } default: - PADDLE_THROW("Unsupported place: %d", static_cast(place)); + PADDLE_THROW(platform::errors::Unavailable("Unsupported place: %d", + static_cast(place))); break; } return nullptr;
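The Reshape and mutable_data checks above encode a fixed calling order for input tensors: bind the name first, then Reshape, then touch the data. A minimal usage sketch consistent with those checks (an assumed predictor and a hypothetical input named "x"):

auto input = predictor->GetInputTensor("x");   // binds name_ and scope_
input->Reshape({1, 3, 224, 224});              // must precede data access
float* host_ptr = input->mutable_data<float>(PaddlePlace::kCPU);
// or let the tensor perform the copy itself:
input->copy_from_cpu(host_data);               // host_data: const float*
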
@@ -96,10 +105,11 @@ PaddleDType ZeroCopyTensor::type() const { template void ZeroCopyTensor::copy_from_cpu(const T *data) { EAGER_GET_TENSOR; - PADDLE_ENFORCE_GE( - tensor->numel(), 0, - "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" - "function before copying data from cpu."); + PADDLE_ENFORCE_GE(tensor->numel(), 0, + platform::errors::PreconditionNotMet( + "You should call ZeroCopyTensor::Reshape(const " + "std::vector &shape) " + "function before copying data from cpu.")); size_t ele_size = tensor->numel() * sizeof(T); if (place_ == PaddlePlace::kCPU) { @@ -116,7 +126,8 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) { memory::Copy(gpu_place, static_cast(t_data), platform::CPUPlace(), data, ele_size, dev_ctx->stream()); #else - PADDLE_THROW("Not compiled with CUDA, should not reach here."); + PADDLE_THROW(platform::errors::Unavailable( + "Not compiled with CUDA, should not reach here.")); #endif } } @@ -141,7 +152,8 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { cudaStreamSynchronize(dev_ctx->stream()); #else - PADDLE_THROW("Not compile with CUDA, should not reach here."); + PADDLE_THROW(platform::errors::Unavailable( + "Not compiled with CUDA, should not reach here.")); #endif } } @@ -176,20 +188,27 @@ template PD_INFER_DECL uint8_t *ZeroCopyTensor::mutable_data( PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { - PADDLE_ENFORCE(!name_.empty(), - "Need to SetName first, so that the corresponding tensor can " - "be retrieved."); - PADDLE_ENFORCE(scope_); + PADDLE_ENFORCE_EQ( + name_.empty(), false, + platform::errors::PreconditionNotMet( + "Need to SetName first, so that the corresponding tensor can " + "be retrieved.")); + PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet( + "The scope should not be nullptr.")); auto *scope = static_cast(scope_); auto *var = scope->FindVar(name_); - PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); auto *tensor = var->GetMutable(); return tensor; } std::vector ZeroCopyTensor::shape() const { EAGER_GET_TENSOR; - PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_); + PADDLE_ENFORCE_NOT_NULL( + tensor_, platform::errors::PreconditionNotMet( + "No tensor called %s found in the scope", name_)); return framework::vectorize(tensor->dims()); }
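The output path is symmetric: after running the predictor, the caller reads the shape first and sizes the host buffer from it. A sketch (assumes <numeric> and <functional> for std::accumulate and std::multiplies):

auto output_names = predictor->GetOutputNames();
auto output = predictor->GetOutputTensor(output_names[0]);
std::vector<int> shape = output->shape();
int numel = std::accumulate(shape.begin(), shape.end(), 1,
                            std::multiplies<int>());
std::vector<float> result(numel);
output->copy_to_cpu(result.data());
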
{ std::string message = "unexpected exception when try to convert " + item; LOG(ERROR) << message; - PADDLE_THROW(message); + PADDLE_THROW(platform::errors::InvalidArgument( + "unexpected exception when try to convert %s.", item)); } return res; } @@ -353,7 +356,8 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, double batch_latency, int epoch = 1, const framework::proto::VarType::Type data_type = framework::proto::VarType::FP32) { - PADDLE_ENFORCE_GT(batch_size, 0, "Non-positive batch size."); + PADDLE_ENFORCE_GT(batch_size, 0, platform::errors::InvalidArgument( + "Non-positive batch size.")); double sample_latency = batch_latency / batch_size; LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid << " ======"; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 9be12ff309acff681da75f7f13e317a408a9552a..793fc53d90b768050572a3dd0a080a5d30e959a2 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -62,9 +62,12 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { if (scales_.find(var_name) != scales_.end()) continue; auto* var = predictor_.sub_scope_->FindVar(var_name); - PADDLE_ENFORCE(var, "%s is not in the scope", var_name); - PADDLE_ENFORCE(var->IsType(), - "Only support lod tensor now."); + PADDLE_ENFORCE_NOT_NULL(var, + platform::errors::PreconditionNotMet( + "%s is not in the scope", var_name)); + PADDLE_ENFORCE_EQ(var->IsType(), true, + platform::errors::PreconditionNotMet( + "Only support lod tensor now.")); LoDTensor* var_tensor = var->GetMutable(); // force unsigned type if already know it @@ -82,9 +85,11 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { } else if (op->Type() == "transpose2" || op->Type() == "reshape2" || op->Type() == "pool2d") { auto input_var_name = op->Input("X")[0]; - PADDLE_ENFORCE(scales_.find(input_var_name) != scales_.end(), - "Input scales must be calculated before the " - "output scales to infer if output is unsigned."); + PADDLE_ENFORCE_NE( + scales_.find(input_var_name), scales_.end(), + platform::errors::PreconditionNotMet( + "Input scales must be calculated before the " + "output scales to infer if output is unsigned.")); if (scales_.find(input_var_name) != scales_.end()) { scales_[var_name] = scales_[input_var_name]; } @@ -94,10 +99,11 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { is_unsigned = true; double min_scale = std::numeric_limits::max(); for (auto input_var_name : op->Input("X")) { - PADDLE_ENFORCE( - scales_.find(input_var_name) != scales_.end(), - "Input scales must be calculated before the " - "output scales to infer if output is unsigned."); + PADDLE_ENFORCE_NE( + scales_.find(input_var_name), scales_.end(), + platform::errors::PreconditionNotMet( + "Input scales must be calculated before the " + "output scales to infer if output is unsigned.")); is_unsigned = is_unsigned && scales_[input_var_name].first; min_scale = std::min( min_scale, @@ -132,11 +138,12 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale( auto rule = qconfig_->scale_algo(op_type_name, conn_name); if (rule == ScaleAlgo::NONE) return; - PADDLE_ENFORCE( - var_tensor.numel() > 0, - "MkldnnQuantizer: LoDTensor of variable %s for quantization of op " - "%s of connection %s should not be empty.", - var_name, op_type_name, conn_name); + PADDLE_ENFORCE_GT( + var_tensor.numel(), 0, + platform::errors::InvalidArgument( + "MkldnnQuantizer: LoDTensor 
of variable %s for quantization of op " + "%s of connection %s should not be empty.", + var_name, op_type_name, conn_name)); switch (rule) { case ScaleAlgo::MAX: @@ -205,10 +212,11 @@ AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor( float min_val = eigen_tensor.minCoeff(); bool is_positive = min_val >= 0.0f; if (is_unsigned) - PADDLE_ENFORCE( - is_positive, - "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", - min_val); + PADDLE_ENFORCE_EQ( + is_positive, true, + platform::errors::InvalidArgument( + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val)); int num_quantized_bins = 255; @@ -316,10 +324,11 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor( float max_abs = eigen_tensor.abs().maxCoeff(); float min_val = eigen_tensor.minCoeff(); if (is_unsigned) - PADDLE_ENFORCE( - min_val >= 0.0f, - "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", - min_val); + PADDLE_ENFORCE_GE( + min_val, 0.0f, + platform::errors::InvalidArgument( + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val)); LoDTensor scale_tensor = CreateScaleTensor(); scale_tensor.data()[0] = 1.0 / max_abs; @@ -330,16 +339,19 @@ std::pair AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor( const LoDTensor& var_tensor, bool is_unsigned, bool is_transposed) const { - PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty."); + PADDLE_ENFORCE_GT( + var_tensor.dims().size(), 0, + platform::errors::InvalidArgument("Tensor dimension is empty.")); ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), var_tensor.numel(), 1}; float min_val = eigen_tensor.minCoeff(); if (is_unsigned) - PADDLE_ENFORCE( - min_val >= 0.0f, - "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", - min_val); + PADDLE_ENFORCE_GE( + min_val, 0.0f, + platform::errors::InvalidArgument( + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val)); auto dims = var_tensor.dims(); constexpr int num_col_dims = 1; @@ -367,17 +379,19 @@ AnalysisPredictor::MkldnnQuantizer::Histogram( const framework::LoDTensor& var_tensor, float min_val, float max_val, size_t num_bins) const { PADDLE_ENFORCE_GT(num_bins, 0, - "MkldnnQuantizer: To calculate Histogram, num_bins (" + - std::to_string(num_bins) + ") must be positive."); - PADDLE_ENFORCE_GT( - var_tensor.numel(), 0, - "MkldnnQuantizer: To calculate Histogram, the tensor must not be empty."); - PADDLE_ENFORCE(max_val >= min_val, - "MkldnnQuantizer: To calculate Histogram, max_val (" + - std::to_string(max_val) + - ") must be greater or equal" - "to min_val (" + - std::to_string(min_val) + ")."); + platform::errors::InvalidArgument( + "MkldnnQuantizer: To calculate Histogram, num_bins (" + + std::to_string(num_bins) + ") must be positive.")); + PADDLE_ENFORCE_GT(var_tensor.numel(), 0, + platform::errors::InvalidArgument( + "MkldnnQuantizer: To calculate Histogram, the tensor " + "must not be empty.")); + PADDLE_ENFORCE_GE(max_val, min_val, + platform::errors::InvalidArgument( + "MkldnnQuantizer: To calculate Histogram, max_val (" + + std::to_string(max_val) + ") must be greater than or equal " + "to min_val (" + + std::to_string(min_val) + ").")); ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), var_tensor.numel(), 1}; auto bin_width = std::abs(max_val - min_val) / num_bins;
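For reference, the bucketing that these Histogram checks guard reduces to simple arithmetic (illustrative numbers, not values from the code):

// num_bins = 4, min_val = 0.0, max_val = 8.0  =>  bin_width = 2.0
float bin_width = std::abs(8.0f - 0.0f) / 4;     // same formula as above
// A sample value 5.3 lands in bin floor((5.3 - 0.0) / 2.0) = 2.
int bin = static_cast<int>((5.3f - 0.0f) / bin_width);
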
@@ -407,7 +421,8 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { auto graph = std::unique_ptr(new Graph(arg.main_program())); arg.SetMainGraph(graph.release()); auto* scope_ptr = arg.scope_ptr(); - PADDLE_ENFORCE(scope_ptr); + PADDLE_ENFORCE_NOT_NULL(scope_ptr, platform::errors::PreconditionNotMet( + "The scope should not be nullptr.")); arg.main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); auto* builder = predictor_.config_.pass_builder(); @@ -441,7 +456,9 @@ bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const { PrepareArgument(); auto& arg = predictor_.argument_; Analyzer().Run(&arg); - PADDLE_ENFORCE(arg.scope_valid()); + PADDLE_ENFORCE_EQ( + arg.scope_valid(), true, + platform::errors::PreconditionNotMet("The scope should be valid.")); VLOG(5) << "to prepare executor"; ARGUMENT_CHECK_FIELD((&arg), ir_analyzed_program); predictor_.inference_program_.reset( @@ -456,7 +473,8 @@ bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const { VLOG(3) << "Predictor: run a quantization warmup iteration"; auto warmup_data = qconfig_->warmup_data(); PADDLE_ENFORCE_NOT_NULL(warmup_data, - "Warmup data cannot be NULL in the config."); + platform::errors::PreconditionNotMet( + "Warmup data cannot be NULL in the config.")); PrettyLogH1("--- Running warmup iteration for quantization"); // Run the inference program @@ -469,7 +487,10 @@ bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const { float AnalysisPredictor::MkldnnQuantizer::SafeEntropy( std::vector reference_distr_P, int P_sum, std::vector candidate_distr_Q, int Q_sum) const { - PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size()); + PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size(), + platform::errors::InvalidArgument( + "The size of P (%d) should be equal to the size of Q (%d)", + reference_distr_P.size(), candidate_distr_Q.size())); float tmp_sum1 = 0; float tmp_sum2 = 0; for (size_t idx = 0; idx < reference_distr_P.size(); idx++) { @@ -479,10 +500,11 @@ float AnalysisPredictor::MkldnnQuantizer::SafeEntropy( tmp_sum1 += 0; tmp_sum2 += 0; } else { - PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " + - std::to_string(idx) + - " qindex = 0! p_idx = " + - std::to_string(p_idx)); + PADDLE_ENFORCE_NE( + q_idx, 0, + platform::errors::PreconditionNotMet( + "MkldnnQuantizer: Fatal error! idx = " + std::to_string(idx) + + " qindex = 0! p_idx = " + std::to_string(p_idx))); } tmp_sum1 += p_idx * (log(Q_sum * p_idx)); tmp_sum2 += p_idx * (log(P_sum * q_idx)); diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index da5d7411693c92eaa2066c7f76d56970f8939bc7..5dc4430fde4715fe11c19ce8adc7397f77391fc3 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -31,12 +31,30 @@ limitations under the License. */ #include "paddle_analysis_config.h" // NOLINT #include "paddle_api.h" // NOLINT +/// +/// \file paddle_inference_api.h +/// +/// \brief Paddle Inference API +/// +/// \author paddle-infer@baidu.com +/// \date 2020-09-01 +/// \since 2.0.0-beta +/// + namespace paddle_infer { using DataType = paddle::PaddleDType; using PlaceType = paddle::PaddlePlace; using PrecisionType = paddle::AnalysisConfig::Precision; using Config = paddle::AnalysisConfig; +/// +/// \class Tensor +/// +/// \brief Represents an n-dimensional array of values. +/// The Tensor is used to store the input or output of the network. +/// It is obtained through Predictor::GetInputHandle() +/// and Predictor::GetOutputHandle() interfaces. 
+/// class PD_INFER_DECL Tensor { public: // Can only be created by predictor->GetInputHandle(cosnt std::string& name) Tensor() = delete; explicit Tensor(std::unique_ptr&& tensor) : tensor_(std::move(tensor)) {} + + /// + /// \brief Reset the shape of the tensor. + /// Generally it's only used for the input tensor. + /// Reshape must be called before calling mutable_data() or CopyFromCpu(). + /// \param shape The shape to set. + /// void Reshape(const std::vector& shape); + /// + /// \brief Copy the host memory to tensor data. + /// It's usually used to set the input tensor data. + /// \param data The pointer of the data, from which the tensor will copy. + /// template void CopyFromCpu(const T* data); - // should add the place + /// + /// \brief Get the memory pointer in CPU or GPU with specific data type. + /// Please Reshape the tensor first before calling this. + /// It's usually used to get input data pointer. + /// \param place The place of the tensor. + /// \return The tensor data buffer pointer. + /// template T* mutable_data(PlaceType place); + /// + /// \brief Copy the tensor data to the host memory. + /// It's usually used to get the output tensor data. + /// \param[out] data The tensor will copy the data to the address. + /// template void CopyToCpu(T* data); + /// + /// \brief Get the memory pointer directly. + /// It's usually used to get the output data pointer. + /// \param[out] place To get the device type of the tensor. + /// \param[out] size To get the data size of the tensor. + /// \return The tensor data buffer pointer. + /// template T* data(PlaceType* place, int* size) const; + /// + /// \brief Set lod info of the tensor. + /// More about LOD can be seen here: + /// https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor + /// \param x the lod info. + /// void SetLoD(const std::vector>& x); + + /// \brief Return the lod info of the tensor. std::vector> lod() const; + /// \brief Return the data type of the tensor. + /// It's usually used to get the output tensor data type. + /// \return The data type of the tensor. DataType type() const; + /// \brief Return the shape of the Tensor. std::vector shape() const; + + /// \brief Return the name of the tensor. const std::string& name() const; private: std::unique_ptr tensor_; }; +/// +/// \class Predictor +/// +/// \brief Predictor is the interface for model prediction. 
+/// +/// The predictor has the following typical uses: +/// +/// Get predictor +/// \code{cpp} +/// auto predictor = CreatePredictor(config); +/// \endcode +/// +/// Get input or output names +/// \code{cpp} +/// auto input_names = predictor->GetInputNames(); +/// auto output_names = predictor->GetOutputNames(); +/// \endcode +/// +/// Get input or output handle +/// \code{cpp} +/// auto input_t = predictor->GetInputHandle(input_names[0]); +/// auto output_t = predictor->GetOutputHandle(output_names[0]); +/// \endcode +/// +/// Run predictor +/// \code{cpp} +/// predictor->Run(); +/// \endcode +/// class PD_INFER_DECL Predictor { public: - Predictor() = default; + Predictor() = delete; ~Predictor() {} // Use for clone explicit Predictor(std::unique_ptr&& pred) : predictor_(std::move(pred)) {} + /// + /// \brief Construct a new Predictor object + /// + /// \param[in] config config + /// explicit Predictor(const Config& config); + /// + /// \brief Get the input names + /// + /// \return input names + /// std::vector GetInputNames(); + + /// + /// \brief Get the Input Tensor object + /// + /// \param[in] name input name + /// \return input tensor + /// std::unique_ptr GetInputHandle(const std::string& name); + /// + /// \brief Run the prediction engine + /// + /// \return Whether the function executed successfully + /// bool Run(); + /// + /// \brief Get the output names + /// + /// \return output names + /// std::vector GetOutputNames(); + + /// + /// \brief Get the Output Tensor object + /// + /// \param[in] name output name + /// \return output tensor + /// std::unique_ptr GetOutputHandle(const std::string& name); + /// + /// \brief Clone to get a new predictor. Thread safe. + /// + /// \return get a new predictor + /// std::unique_ptr Clone(); + + /// \brief Clear the intermediate tensors of the predictor void ClearIntermediateTensor(); private: std::unique_ptr predictor_; }; +/// +/// \brief A factory to help create predictors. +/// +/// Usage: +/// +/// \code{.cpp} +/// Config config; +/// ... // change the configs. +/// auto predictor = CreatePredictor(config); +/// \endcode +/// PD_INFER_DECL std::shared_ptr CreatePredictor( const Config& config); // NOLINT + PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype); PD_INFER_DECL std::string GetVersion(); @@ -128,13 +272,24 @@ T* Tensor::data(PlaceType* place, int* size) const { namespace paddle_infer { namespace services { +/// +/// \class PredictorPool +/// +/// \brief PredictorPool is a simple encapsulation of Predictor, suitable for +/// use in multi-threaded situations. According to the thread id, the +/// corresponding Predictor is taken out from PredictorPool to complete the +/// prediction. +/// class PD_INFER_DECL PredictorPool { public: PredictorPool() = delete; PredictorPool(const PredictorPool&) = delete; PredictorPool& operator=(const PredictorPool&) = delete; + /// \brief Construct the predictor pool with \param size predictor instances. explicit PredictorPool(const Config& config, size_t size = 1); + + /// \brief Get the \param idx-th predictor. 
Predictor* Retrive(size_t idx); private: diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 98a36a3308dc539ee5aecad9e71f50be310e584c..c19e77d2714bcfc18c2cf2a98511d31a97295daa 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -231,6 +231,10 @@ void CpuPassStrategy::EnableMkldnnQuantizer() { void CpuPassStrategy::EnableMkldnnBfloat16() { #ifdef PADDLE_WITH_MKLDNN + if (!use_mkldnn_bfloat16_) { + passes_.push_back("cpu_bfloat16_placement_pass"); + passes_.push_back("cpu_bfloat16_pass"); + } use_mkldnn_bfloat16_ = true; #else use_mkldnn_bfloat16_ = false; diff --git a/paddle/fluid/inference/capi/c_api.cc b/paddle/fluid/inference/capi/c_api.cc index 821dff2f036c1892570a8ade5b40363251c7f531..07493c742c4fa906e7c4817e328e7d4f81afbffa 100644 --- a/paddle/fluid/inference/capi/c_api.cc +++ b/paddle/fluid/inference/capi/c_api.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -34,27 +35,37 @@ void PD_DeletePaddleBuf(PD_PaddleBuf* buf) { } void PD_PaddleBufResize(PD_PaddleBuf* buf, size_t length) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); buf->buf.Resize(length); } void PD_PaddleBufReset(PD_PaddleBuf* buf, void* data, size_t length) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); buf->buf.Reset(data, length); } bool PD_PaddleBufEmpty(PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); return buf->buf.empty(); } void* PD_PaddleBufData(PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); return buf->buf.data(); } size_t PD_PaddleBufLength(PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); return buf->buf.length(); } diff --git a/paddle/fluid/inference/capi/c_api_internal.h b/paddle/fluid/inference/capi/c_api_internal.h index 2dd827229779d34384df2b3ba5f398c77db8369a..7e69b7210768e5af9e8f4150883a608a1517a13c 100644 --- a/paddle/fluid/inference/capi/c_api_internal.h +++ b/paddle/fluid/inference/capi/c_api_internal.h @@ -18,7 +18,6 @@ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" -#include "paddle/fluid/platform/enforce.h" using PD_PaddleDType = paddle::PaddleDType; using PD_ACPrecision = paddle::AnalysisConfig::Precision; diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index b99abc06b27ecb9686b4c6e883aaaf8b3e592415..af8d4a69ecf24862ca5f282655b72ef37307c1c8 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -20,6 +20,7 @@ #include #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" 
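// (Sketch) The C API below wraps AnalysisConfig behind an opaque
// PD_AnalysisConfig; a typical call sequence, assuming a hypothetical model
// directory (the config toggles are the functions defined in this file):
//   PD_AnalysisConfig* cfg = PD_NewAnalysisConfig();
//   PD_SetModel(cfg, "./model_dir", nullptr);
//   PD_EnableMKLDNN(cfg);
//   PD_SetCpuMathLibraryNumThreads(cfg, 4);
//   ...run a predictor with cfg, then release it:
//   PD_DeleteAnalysisConfig(cfg);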
+#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -40,7 +41,10 @@ void PD_DeleteAnalysisConfig(PD_AnalysisConfig* config) { void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir, const char* params_path) { LOG(INFO) << model_dir; - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); LOG(INFO) << std::string(model_dir); if (!params_path) { config->config.SetModel(std::string(model_dir)); @@ -50,104 +54,164 @@ void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir, } void PD_SetProgFile(PD_AnalysisConfig* config, const char* x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetProgFile(std::string(x)); } void PD_SetParamsFile(PD_AnalysisConfig* config, const char* x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetParamsFile(std::string(x)); } void PD_SetOptimCacheDir(PD_AnalysisConfig* config, const char* opt_cache_dir) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetOptimCacheDir(std::string(opt_cache_dir)); } const char* PD_ModelDir(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.model_dir().c_str(); } const char* PD_ProgFile(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.prog_file().c_str(); } const char* PD_ParamsFile(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.params_file().c_str(); } void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb, int device_id) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableUseGpu(static_cast(memory_pool_init_size_mb), device_id); } void PD_DisableGpu(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.DisableGpu(); } bool PD_UseGpu(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.use_gpu(); } int PD_GpuDeviceId(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of 
analysis configuration shouldn't be nullptr")); return config->config.gpu_device_id(); } int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.memory_pool_init_size_mb(); } float PD_FractionOfGpuMemoryForPool(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.fraction_of_gpu_memory_for_pool(); } void PD_EnableCUDNN(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableCUDNN(); } bool PD_CudnnEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.cudnn_enabled(); } void PD_SwitchIrOptim(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SwitchIrOptim(x); } bool PD_IrOptim(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.ir_optim(); } void PD_SwitchUseFeedFetchOps(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SwitchUseFeedFetchOps(x); } bool PD_UseFeedFetchOpsEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.use_feed_fetch_ops_enabled(); } void PD_SwitchSpecifyInputNames(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SwitchSpecifyInputNames(x); } bool PD_SpecifyInputName(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.specify_input_name(); } @@ -155,110 +219,168 @@ void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size, int max_batch_size, int min_subgraph_size, Precision precision, bool use_static, bool use_calib_mode) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableTensorRtEngine( workspace_size, max_batch_size, min_subgraph_size, paddle::ConvertToACPrecision(precision), use_static, use_calib_mode); } bool 
PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.tensorrt_engine_enabled(); } void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SwitchIrDebug(x); } void PD_EnableMKLDNN(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableMKLDNN(); } void PD_SetMkldnnCacheCapacity(PD_AnalysisConfig* config, int capacity) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetMkldnnCacheCapacity(capacity); } bool PD_MkldnnEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.mkldnn_enabled(); } void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config, int cpu_math_library_num_threads) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetCpuMathLibraryNumThreads(cpu_math_library_num_threads); } int PD_CpuMathLibraryNumThreads(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.cpu_math_library_num_threads(); } void PD_EnableMkldnnQuantizer(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableMkldnnQuantizer(); } bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.mkldnn_quantizer_enabled(); } void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound( - "PD_AnalysisConfig should not be null")); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableMkldnnBfloat16(); } bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound( - "PD_AnalysisConfig should not be null")); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.mkldnn_bfloat16_enabled(); } void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer, size_t prog_buffer_size, const char* params_buffer, size_t 
params_buffer_size) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer, params_buffer_size); } bool PD_ModelFromMemory(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.model_from_memory(); } void PD_EnableMemoryOptim(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableMemoryOptim(); } bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.enable_memory_optim(); } void PD_EnableProfile(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableProfile(); } bool PD_ProfileEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.profile_enabled(); } void PD_SetInValid(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetInValid(); } bool PD_IsValid(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.is_valid(); } diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index 8aa1e2a7b7f9b99a1636ca2e7396089ab2ae7e15..0509a6190211c25b6461c1d683daa6b33110b4e0 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ b/paddle/fluid/inference/capi/pd_predictor.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -81,7 +82,10 @@ extern "C" { bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, int in_size, PD_Tensor** output_data, int* out_size, int batch_size) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); VLOG(3) << "Predoctor: PD_PredictorRun. 
"; static std::map> predictors; @@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, PD_ZeroCopyData* inputs, int in_size, PD_ZeroCopyData** output, int* out_size) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); static std::map> predictors; if (!predictors.count(config->config.model_dir())) { @@ -144,7 +151,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, input_t->copy_from_cpu(static_cast(inputs[i].data)); break; default: - CHECK(false) << "Unsupport data type."; + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupported data type.")); break; } } @@ -227,7 +235,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor, input->copy_from_cpu(static_cast(tensor->data.data)); break; default: - CHECK(false) << "Unsupport data type."; + PADDLE_THROW( + paddle::platform::errors::InvalidArgument("Unsupported data type.")); break; } @@ -294,7 +303,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) { output->copy_to_cpu(reinterpret_cast(tensor->data.data)); break; default: - CHECK(false) << "Unsupport data type."; + PADDLE_THROW( + paddle::platform::errors::InvalidArgument("Unsupported data type.")); break; } } diff --git a/paddle/fluid/inference/capi/pd_tensor.cc b/paddle/fluid/inference/capi/pd_tensor.cc index b4811f1d6ff192659fa12b33008fe5ac07e6a6c5..9b1eedd7c5a8106a6f6b7be3f682913e2431a3e5 100644 --- a/paddle/fluid/inference/capi/pd_tensor.cc +++ b/paddle/fluid/inference/capi/pd_tensor.cc @@ -19,6 +19,7 @@ #include #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) { } void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.name = std::string(name); } void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype); } void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.data = buf->buf; } void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.shape.assign(shape, shape + size); } const char* PD_GetPaddleTensorName(const PD_Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); return tensor->tensor.name.c_str(); } PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + 
paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); return ConvertToPDDataType(tensor->tensor.dtype); } PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); PD_PaddleBuf* ret = PD_NewPaddleBuf(); ret->buf = tensor->tensor.data; return ret; } const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); const std::vector& shape = tensor->tensor.shape; *size = shape.size(); return shape.data(); diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 8e88c94493952ff257ef69bf73f8edebb6ba2eee..5f24ef00bce59e5886d8448cf3f8356e9aeba481 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -20,8 +20,12 @@ #define LITE_WITH_XPU 1 #endif +#ifndef PADDLE_WITH_ARM +#define LITE_WITH_X86 1 +#endif + #include "paddle/fluid/inference/lite/engine.h" -#include "lite/api/paddle_use_passes.h" +#include namespace paddle { namespace inference { @@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const { return engines_.at(name).get() != nullptr; } -paddle::lite::Predictor* EngineManager::Get(const std::string& name) const { +paddle::lite_api::PaddlePredictor* EngineManager::Get( + const std::string& name) const { return engines_.at(name).get(); } -paddle::lite::Predictor* EngineManager::Create(const std::string& name, - const EngineConfig& cfg) { - if (cfg.valid_places.front().target == TARGET(kCUDA)) { -#ifdef PADDLE_WITH_CUDA - paddle::lite::Env::Init(); +paddle::lite_api::PaddlePredictor* EngineManager::Create( + const std::string& name, const EngineConfig& cfg) { + // config info for predictor. 
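+ // The model and params strings in EngineConfig are used as in-memory + // buffers here (model_from_memory), and valid_places selects the backends + // the created predictor is allowed to run on.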
+ paddle::lite_api::CxxConfig lite_cxx_config; + lite_cxx_config.set_model_buffer(cfg.model.c_str(), cfg.model.size(), + cfg.param.c_str(), cfg.param.size()); + lite_cxx_config.set_valid_places(cfg.valid_places); +#ifdef PADDLE_WITH_ARM + lite_cxx_config.set_threads(cfg.cpu_math_library_num_threads); +#else + lite_cxx_config.set_x86_math_library_num_threads( + cfg.cpu_math_library_num_threads); #endif - } else if (cfg.valid_places.front().target == TARGET(kXPU)) { + #ifdef PADDLE_WITH_XPU - paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread = - cfg.xpu_l3_workspace_size; + lite_cxx_config.set_xpu_workspace_l3_size_per_thread( + cfg.xpu_l3_workspace_size); #endif - } - auto* p = new paddle::lite::Predictor(); - p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes, - cfg.model_type, cfg.model_from_memory); - engines_[name].reset(p); - return p; + + // create predictor + std::shared_ptr<paddle::lite_api::PaddlePredictor> p = + paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(lite_cxx_config); + engines_[name] = std::move(p); + return engines_[name].get(); } void EngineManager::DeleteAll() { for (auto& item : engines_) { - item.second.reset(nullptr); + item.second.reset(); } } diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 345eb682e9fe81d4ec67a31082c1d347a694fd96..5ba487cc24d7d58cd87853a58fc12f1a82c3610d 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -23,12 +23,9 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wall" #include "lite/api/cxx_api.h" +#include "lite/api/paddle_api.h" #include "lite/api/paddle_place.h" -#include "lite/core/context.h" -#include "lite/core/device_info.h" -#include "lite/core/memory.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" +#include "lite/api/paddle_use_passes.h" #pragma GCC diagnostic pop namespace paddle { @@ -38,25 +35,33 @@ namespace lite { struct EngineConfig { std::string model; std::string param; - paddle::lite::Place prefer_place; - std::vector<paddle::lite::Place> valid_places; + std::vector<paddle::lite_api::Place> valid_places; std::vector<std::string> neglected_passes; lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf}; bool model_from_memory{true}; + + // for xpu size_t xpu_l3_workspace_size; + + // for x86 or arm + int cpu_math_library_num_threads{1}; + + // for cuda + bool use_multi_stream{false}; }; class EngineManager { public: bool Empty() const; bool Has(const std::string& name) const; - paddle::lite::Predictor* Get(const std::string& name) const; - paddle::lite::Predictor* Create(const std::string& name, - const EngineConfig& cfg); + paddle::lite_api::PaddlePredictor* Get(const std::string& name) const; + paddle::lite_api::PaddlePredictor* Create(const std::string& name, + const EngineConfig& cfg); void DeleteAll(); private: - std::unordered_map<std::string, std::unique_ptr<paddle::lite::Predictor>> + std::unordered_map<std::string, + std::shared_ptr<paddle::lite_api::PaddlePredictor>> engines_; }; diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index d79a041ccf8a1611247b65b034c03940eabfcccd..33661594b926f284052c85c6a816a17dfff1ce20 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -13,6 +13,7 @@ // limitations under the License.
#include "paddle/fluid/inference/lite/tensor_utils.h" +#include #include #include #include "paddle/fluid/framework/data_type.h" @@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data, } } -void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) { +void* GetLiteTensorDataPtr(paddle::lite_api::Tensor* src, + PrecisionType precision_type, + TargetType target_type) { + void* res{nullptr}; + switch (precision_type) { + case PrecisionType::kFloat: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt8: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt32: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt64: + res = static_cast(src->mutable_data(target_type)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported precision type. Now only supports FP32, INT8, INT32 and " + "INT64.")); + break; + } + return res; +} + +int64_t GetLiteTensorNumel(const paddle::lite_api::Tensor& tensor) { + auto shape = tensor.shape(); + int64_t numel = std::accumulate(shape.begin(), shape.end(), 1, + std::multiplies()); + return numel; +} + +void InitDstTensor(paddle::lite_api::Tensor* dst, + const framework::LoDTensor& src) { // Currently, Lite needs to explicitly specify the target type of // the input tensor. constexpr int empty_size = 0; - dst->mutable_data(GetLiteTargetType(src.place()), empty_size); - dst->set_precision(GetLitePrecisionType(src.type())); - SetLoD(dst->mutable_lod(), src.lod()); + dst->Resize({empty_size}); + GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()), + GetLiteTargetType(src.place())); + dst->SetPrecision(GetLitePrecisionType(src.type())); + paddle::lite::LoD lite_lod; + SetLoD(&lite_lod, src.lod()); + dst->SetLoD(lite_lod); } -void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { +void InitDstTensor(framework::LoDTensor* dst, + const paddle::lite_api::Tensor& src) { constexpr framework::proto::VarType::Type dtype = framework::proto::VarType_Type_FP32; dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()), @@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { } template <> -void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, +void TensorCopyAsync(paddle::lite_api::Tensor* dst, + const framework::LoDTensor& src, const platform::DeviceContext& ctx) { InitDstTensor(dst, src); const platform::Place& src_place = src.place(); @@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, static_cast(src.numel()) * framework::SizeOfType(src.type()); dst->Resize(framework::vectorize(src.dims())); const void* src_data = src.data(); - void* dst_data = dst->mutable_data(bytes); + void* dst_data{nullptr}; + dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()), + GetLiteTargetType(src.place())); VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src << ", dst = " << dst << ", src_type = " << src.type(); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); - VLOG(3) << "[Lite memory size] Bytes = " << dst->memory_size(); + VLOG(3) << "[Lite memory size] Bytes = " << bytes; } template <> -void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src, +void TensorCopyAsync(framework::LoDTensor* dst, + const paddle::lite_api::Tensor& src, const 
platform::DeviceContext& ctx) { - dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize())); + dst->Resize(paddle::framework::make_ddim(src.shape())); InitDstTensor(dst, src); const platform::Place& src_place = GetNativePlace(src.target()); const platform::Place& dst_place = dst->place(); - const size_t bytes = - static_cast<size_t>(src.numel()) * framework::SizeOfType(dst->type()); - const void* src_data = src.raw_data(); + int64_t src_numel = GetLiteTensorNumel(src); + const size_t bytes = src_numel * framework::SizeOfType(dst->type()); + const void* src_data = src.data(); // When Lite is ready, the source type needs to be modified here. void* dst_data = dst->mutable_data(dst_place, dst->type()); VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src << ", dst = " << dst << ", src_type = " << dst->type(); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); - VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size(); + VLOG(3) << "[Lite memory size] Bytes = " << bytes; } template <> -void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) { - const size_t bytes = - static_cast<size_t>(src->numel()) * framework::SizeOfType(src->type()); - auto buf = std::make_shared<paddle::lite::Buffer>(paddle::lite::Buffer( - src->data(), GetLiteTargetType(src->place()), src->memory_size())); +void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) { dst->Resize(framework::vectorize(src->dims())); - dst->set_precision(GetLitePrecisionType(src->type())); - SetLoD(dst->mutable_lod(), src->lod()); - dst->ResetBuffer(buf, bytes); + dst->ShareExternalMemory(src->data(), src->memory_size(), + GetLiteTargetType(src->place())); + dst->SetPrecision(GetLitePrecisionType(src->type())); + paddle::lite::LoD lite_lod; + SetLoD(&lite_lod, src->lod()); + dst->SetLoD(lite_lod); } template <> -void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) { +void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { constexpr framework::proto::VarType::Type dtype = framework::proto::VarType_Type_FP32; - void* src_raw_data = src->raw_data(); + void* src_raw_data = + GetLiteTensorDataPtr(src, GetLitePrecisionType(dtype), src->target()); + size_t memory_size = GetLiteTensorNumel(*src) * sizeof(float); std::shared_ptr<memory::allocation::Allocation> holder( - new memory::allocation::Allocation(src_raw_data, src->memory_size(), + new memory::allocation::Allocation(src_raw_data, memory_size, GetNativePlace(src->target()))); - dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize())); + dst->Resize(paddle::framework::make_ddim(src->shape())); SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType(holder, dtype); } diff --git a/paddle/fluid/inference/lite/test_engine.cc b/paddle/fluid/inference/lite/test_engine.cc index d29bcb76be78f151dc606d9f335e9df9ed19b16b..e505af19d5389c074c5777d0235dfa055d1395a7 100644 --- a/paddle/fluid/inference/lite/test_engine.cc +++ b/paddle/fluid/inference/lite/test_engine.cc @@ -102,10 +102,10 @@ TEST(EngineManager, engine) { config.model_from_memory = true; config.valid_places = { #ifdef PADDLE_WITH_CUDA - paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}), #endif - paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), - paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), + paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}), }; LOG(INFO) << "Create EngineManager"; @@ -118,7 +118,7
@@ TEST(EngineManager, engine) { ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has( unique_key), true); - paddle::lite::Predictor* engine_0 = + paddle::lite_api::PaddlePredictor* engine_0 = inference::Singleton<inference::lite::EngineManager>::Global().Get( unique_key); CHECK_NOTNULL(engine_0); diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index eef7bfb68fe06537d09f3f3e7e5c35283d4739ef..a792fb77d6ad483601402506685e2f91066571da 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) { EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC)); } +template <typename T> +void test_lite_tensor_data_ptr(PrecisionType precision_type) { + void* GetLiteTensorDataPtr(paddle::lite_api::Tensor * src, + PrecisionType precision_type, + TargetType target_type); + const int count = 4; + paddle::lite::Tensor lite_tensor; + lite_tensor.Resize({count}); + auto* lite_tensor_data = lite_tensor.mutable_data<T>(); + for (size_t i = 0; i < count; ++i) { + lite_tensor_data[i] = i; + } + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + T* data = static_cast<T*>(GetLiteTensorDataPtr( + &lite_api_tensor, precision_type, TargetType::kHost)); + for (size_t i = 0; i < count; ++i) { + CHECK_EQ(data[i], static_cast<T>(i)) << "the i-th num is not correct."; + } +} + +TEST(LiteEngineOp, GetLiteTensorDataPtr) { + test_lite_tensor_data_ptr<int64_t>(PrecisionType::kInt64); + test_lite_tensor_data_ptr<int32_t>(PrecisionType::kInt32); + test_lite_tensor_data_ptr<int8_t>(PrecisionType::kInt8); + EXPECT_ANY_THROW(test_lite_tensor_data_ptr<float>(PrecisionType::kUnk)); +} + void test_tensor_copy(const platform::DeviceContext& ctx) { // Create LoDTensor. std::vector<float> vector({1, 2, 3, 4}); @@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { lod_tensor.set_lod(lod); // Create lite::Tensor and copy. paddle::lite::Tensor lite_tensor; - TensorCopyAsync(&lite_tensor, lod_tensor, ctx); + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + TensorCopyAsync(&lite_api_tensor, lod_tensor, ctx); // Copy to LoDTensor. framework::LoDTensor lod_tensor_n; - TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); + TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(ctx.GetPlace())) { platform::GpuStreamSync( @@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) { lod_tensor.set_lod(lod); // Create lite::Tensor and share. paddle::lite::Tensor lite_tensor; - TensorDataShare(&lite_tensor, &lod_tensor); + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + TensorDataShare(&lite_api_tensor, &lod_tensor); // Copy to LoDTensor.
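+ // TensorDataShare above makes lite_api_tensor alias the LoDTensor's buffer, + // so copying it back through TensorCopyAsync must reproduce the original + // values exactly.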
framework::LoDTensor lod_tensor_n; - TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); + TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); std::vector<float> result; TensorToVector(lod_tensor_n, ctx, &result); ASSERT_EQ(result, vector); diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index 28afb87a891fb301b1b5108c9762bf6c88cefb96..5d63aa2ace86cb89917126f3a6fef9d0e9839e8c 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -34,8 +34,11 @@ class ConcatOpConverter : public OpConverter { itensors.push_back(engine_->GetITensor(input_name)); } int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); - PADDLE_ENFORCE(axis > 0, - "The axis attr of Concat op should be large than 0 for trt"); + PADDLE_ENFORCE_GT(axis, 0, platform::errors::InvalidArgument( + "The axis attr of Concat" + " op should be larger than 0 for trt. " + "But received %d.", + axis)); auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(), itensors.size()); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 10c212c0b4fa394e3c745bf524ef9d081c4bc3c1..aa03bc44bd629513d96cda541c0b7162629bfdc8 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -100,7 +100,9 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input, nv_ksize, weight, bias); - PADDLE_ENFORCE(layer != nullptr); + PADDLE_ENFORCE_NOT_NULL(layer, + platform::errors::Fatal("TensorRT create conv2d" + " layer error.")); layer->setStride(nv_strides); layer->setPadding(nv_paddings); layer->setNbGroups(groups); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index c4f0855dbb1ca87b40c396692a812a3cbe06a7b8..dfadb28a6520f983986263b38be69fa48335d485 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -43,13 +43,30 @@ class ElementwiseWeightOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + PADDLE_ENFORCE_EQ( + op_desc.Input("X").size(), 1, + platform::errors::InvalidArgument( + "The input op's Input(\"X\").size() " + "should equal to 1, but received Input(\"X\").size() = %u.", + op_desc.Input("X").size())); + PADDLE_ENFORCE_EQ( + op_desc.Input("Y").size(), 1, + platform::errors::InvalidArgument( + "The input op's Input(\"Y\").size() " + "should equal to 1, but received Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); // Y is a weight + PADDLE_ENFORCE_EQ( + op_desc.Output("Out").size(), 1, + platform::errors::InvalidArgument( + "The input op's Output(\"Out\").size() " + "should equal to 1, but received Output(\"Out\").size() = %u.", + op_desc.Output("Out").size())); auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); - PADDLE_ENFORCE_NOT_NULL(Y_v); + PADDLE_ENFORCE_NOT_NULL( + Y_v, platform::errors::NotFound("Variable %s
not found in scope.", + op_desc.Input("Y").front().c_str())); auto* Y_t = Y_v->GetMutable(); float* weight_data = nullptr; weight_data = @@ -176,9 +193,24 @@ class ElementwiseTensorOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + PADDLE_ENFORCE_EQ( + op_desc.Input("X").size(), 1, + platform::errors::InvalidArgument( + "The input op's Input(\"X\").size() " + "should equal to 1, but received Input(\"X\").size() = %u.", + op_desc.Input("X").size())); + PADDLE_ENFORCE_EQ( + op_desc.Input("Y").size(), 1, + platform::errors::InvalidArgument( + "The input op's Input(\"Y\").size() " + "should equal to 1, but received Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); // Y is a weight + PADDLE_ENFORCE_EQ( + op_desc.Output("Out").size(), 1, + platform::errors::InvalidArgument( + "The input op's Output(\"Out\").size() " + "should equal to 1, but received Output(\"Out\").size() = %u.", + op_desc.Output("Out").size())); auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc index 854f434d93e81237dc85c5df62debcf3b3824b78..d9cf9e2e860018df594ac4d84a4d9fa9b9ba669f 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc @@ -29,38 +29,67 @@ class DefaultIOConverter : public EngineIOConverter { // NOTE out is GPU memory. virtual void operator()(const LoDTensor& in, void* out, size_t max_size) override { - PADDLE_ENFORCE(out != nullptr); - PADDLE_ENFORCE(stream_ != nullptr); + PADDLE_ENFORCE_NOT_NULL(out, + platform::errors::InvalidArgument( + "The input param 'out' must not be nullptr.")); + PADDLE_ENFORCE_NOT_NULL(stream_, + platform::errors::PreconditionNotMet( + "You should set up stream_ by SetStream() " + "before you call the operator().")); const auto& place = in.place(); size_t size = in.memory_size(); - PADDLE_ENFORCE_LE(size, max_size); + PADDLE_ENFORCE_LE( + size, max_size, + platform::errors::InvalidArgument( + "The input Tensor in's memory_size shoule be less than or equal to " + "the input max_size. But in's memory_size = %u, max_size = %u.", + size, max_size)); if (is_cpu_place(place)) { - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data(), size, - cudaMemcpyHostToDevice, *stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + out, in.data(), size, cudaMemcpyHostToDevice, *stream_)); } else if (is_gpu_place(place)) { - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data(), size, - cudaMemcpyDeviceToDevice, *stream_)); + PADDLE_ENFORCE_EQ( + 0, cudaMemcpyAsync(out, in.data(), size, + cudaMemcpyDeviceToDevice, *stream_), + platform::errors::External( + "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error.")); } else { - PADDLE_THROW("Unknown device for converter"); + PADDLE_THROW(platform::errors::NotFound("Unknown device for converter")); } cudaStreamSynchronize(*stream_); } // NOTE in is GPU memory. 
virtual void operator()(const void* in, LoDTensor* out, size_t max_size) override { - PADDLE_ENFORCE(in != nullptr); - PADDLE_ENFORCE(stream_ != nullptr); + PADDLE_ENFORCE_NOT_NULL(in, + platform::errors::InvalidArgument( + "The input param 'in' must not be nullptr.")); + PADDLE_ENFORCE_NOT_NULL(stream_, + platform::errors::PreconditionNotMet( + "You should set up stream_ by SetStream() " + "before you call the operator().")); const auto& place = out->place(); size_t size = out->memory_size(); - PADDLE_ENFORCE_LE(size, max_size); + PADDLE_ENFORCE_LE( + size, max_size, + platform::errors::InvalidArgument( + "The input Tensor out's memory_size should be less than or equal " + "to the input max_size. " + "But out's memory_size = %u, max_size = %u.", + size, max_size)); if (is_cpu_place(place)) { PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size, - cudaMemcpyDeviceToHost, *stream_)); + cudaMemcpyDeviceToHost, *stream_), + platform::errors::External( + "cudaMemcpyAsync(cudaMemcpyDeviceToHost) error.")); } else if (is_gpu_place(place)) { - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size, - cudaMemcpyDeviceToDevice, *stream_)); + PADDLE_ENFORCE_EQ( + 0, cudaMemcpyAsync(out->data<float>(), in, size, + cudaMemcpyDeviceToDevice, *stream_), + platform::errors::External( + "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error.")); } else { - PADDLE_THROW("Unknown device for converter"); + PADDLE_THROW(platform::errors::NotFound("Unknown device for converter")); } cudaStreamSynchronize(*stream_); } diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h index 5daa242f6ab802a50fa6105f0102b817b700f461..58c178028b8b275b57f5c298534bd1d31aede234 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h @@ -44,10 +44,14 @@ class EngineIOConverter { static void ConvertInput(const std::string& op_type, const LoDTensor& in, void* out, size_t max_size, cudaStream_t* stream) { - PADDLE_ENFORCE(stream != nullptr); + PADDLE_ENFORCE_NOT_NULL(stream, + platform::errors::InvalidArgument( + "The input stream must not be nullptr.")); auto* converter = Registry<EngineIOConverter>::Global().Lookup( op_type, "default" /* default_type */); - PADDLE_ENFORCE_NOT_NULL(converter); + PADDLE_ENFORCE_NOT_NULL( + converter, platform::errors::Unimplemented( + "The %s is not supported yet.", op_type.c_str())); converter->SetStream(stream); (*converter)(in, out, max_size); } @@ -55,10 +59,14 @@ class EngineIOConverter { static void ConvertOutput(const std::string& op_type, const void* in, LoDTensor* out, size_t max_size, cudaStream_t* stream) { - PADDLE_ENFORCE(stream != nullptr); + PADDLE_ENFORCE_NOT_NULL(stream, + platform::errors::InvalidArgument( + "The input stream must not be nullptr.")); auto* converter = Registry<EngineIOConverter>::Global().Lookup( op_type, "default" /* default_type */); - PADDLE_ENFORCE_NOT_NULL(converter); + PADDLE_ENFORCE_NOT_NULL( + converter, platform::errors::Unimplemented( + "The %s is not supported yet.", op_type.c_str())); converter->SetStream(stream); (*converter)(in, out, max_size); } diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index f4b0f5f23d8fda064c29534b56868beae79f65c0..ac0a04b9a116d907fd69c0ca58d3ae7e82921dab 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -53,7 +53,12 @@ class OpConverter { OpConverter* it{nullptr}; if
(op_desc.Type() == "mul") { - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL, + platform::errors::InvalidArgument( + "The input op mul's Input(\"Y\")." + "size() should equal to 1, but reveceid " + "Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); std::string Y = op_desc.Input("Y")[0]; if (parameters.count(Y)) { it = Registry::Global().Lookup("fc"); @@ -66,38 +71,51 @@ class OpConverter { // static std::unordered_set add_weight_op_set {"add", "mul", // "sub", "div"}; static std::unordered_set add_weight_op_set{"add", "mul"}; - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL, + platform::errors::InvalidArgument( + "The input op's Input(\"Y\")." + "size() should equal to 1, but reveceid " + "Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); int op_type_len = op_desc.Type().size(); std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); std::string Y = op_desc.Input("Y")[0]; if (parameters.count(Y)) { - PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0, - "Unsupported elementwise type" + op_type); + PADDLE_ENFORCE_GT( + add_weight_op_set.count(op_type), 0, + platform::errors::Unimplemented("Unsupported elementwise type %s", + op_type.c_str())); it = Registry::Global().Lookup("elementwise_" + op_type + "_weight"); - PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", - op_desc.Type()); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented( + "no OpConverter for optype [%s]", op_desc.Type())); } else { - PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0, - "Unsupported elementwise type" + op_type); + PADDLE_ENFORCE_GT( + add_tensor_op_set.count(op_type), 0, + platform::errors::Unimplemented("Unsupported elementwise type %s", + op_type.c_str())); it = Registry::Global().Lookup("elementwise_" + op_type + "_tensor"); } - PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", - op_desc.Type()); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); } if (op_desc.Type() == "depthwise_conv2d") { it = Registry::Global().Lookup("conv2d"); - PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", - op_desc.Type()); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); } if (!it) { it = Registry::Global().Lookup(op_desc.Type()); } - PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", - op_desc.Type()); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); it->SetEngine(engine); (*it)(op, scope, test_mode); @@ -149,9 +167,13 @@ class OpConverter { for (auto& input : inputs) { if (parameters.count(input)) continue; auto* var = block_desc->FindVar(input); - PADDLE_ENFORCE(var, "no variable called %s", input); - PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, - "TensorRT engine only takes LoDTensor as input"); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("no variable called %s in block.", + input.c_str())); + PADDLE_ENFORCE_EQ( + var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, + platform::errors::InvalidArgument("TensorRT engine only takes " + "LoDTensor as input")); auto var_shape = var->GetShape(); if (engine->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 
a1b0f3b4310a020d4bbf8d7c04c9447d3e0e72f7..dd594404d3316ada6e20624c074368f241ca5cdd 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -39,9 +39,22 @@ class PadOpConverter : public OpConverter { nvinfer1::Dims input_shape = input->getDimensions(); int nbDims = input_shape.nbDims; int pad_size = static_cast<int>(paddings.size()); - PADDLE_ENFORCE_GE(nbDims, 2); - PADDLE_ENFORCE_EQ((nbDims + 1) * 2, pad_size); - PADDLE_ENFORCE(pad_value == 0.0, "The pad layer of TRT only support zero."); + PADDLE_ENFORCE_GE( + nbDims, 2, + platform::errors::InvalidArgument( + "Input X[0]'s dimension should be greater than or equal to 2. " + "But received %d.", + nbDims)); + PADDLE_ENFORCE_EQ( + (nbDims + 1) * 2, pad_size, + platform::errors::InvalidArgument("Input X[0]'s dimension(nbDims for " + "short) should meet the condition:" + "(nbDims + 1) * 2 == pad_size. But " + "received nbDims:%d, pad_size:%d.", + nbDims, pad_size)); + PADDLE_ENFORCE_EQ(pad_value, 0.0, + platform::errors::InvalidArgument( + "The pad layer of TRT only supports zero.")); nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]); nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); @@ -50,7 +63,9 @@ class PadOpConverter : public OpConverter { *const_cast<nvinfer1::ITensor*>(input), pre_pad, post_pad); - PADDLE_ENFORCE(layer != nullptr); + PADDLE_ENFORCE_NOT_NULL(layer, + platform::errors::External( + "Failed to add padding layer to the TensorRT engine.")); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "pad", {output_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 2a76317eea1b78d13b2ff9d49cc86020ae3cfe96..3c3fead3d361bdb87d8a52dc9a5e986da3975df3 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -23,9 +23,8 @@ class SliceOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { -// This OP is implemented by trt dynamic shpae plugin. -// Dynamic shape plugin requires TRT version greater than 6.0. -#if IS_TRT_VERSION_GE(6000) + // This OP is implemented by trt dynamic shape plugin. + // Dynamic shape plugin requires TRT version greater than 6.0. VLOG(4) << "convert slice op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); // Declare inputs @@ -38,27 +37,65 @@ class SliceOpConverter : public OpConverter { std::vector<int> ends = BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("ends")); + PADDLE_ENFORCE_EQ( + starts.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of starts must be equal to the size of axes.")); + PADDLE_ENFORCE_EQ( + ends.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of ends must be equal to the size of axes.")); + + auto input_dims = input->getDimensions(); + if (!engine_->with_dynamic_shape()) { + // notice that input shape is [CHW] without batch axis when input has + // static shape + for (size_t i = input_dims.nbDims; i > 0; i--) { + input_dims.d[i] = input_dims.d[i - 1]; + } + input_dims.d[0] = 1; // fake batchsize, not useful here + for (size_t i = 0; i < axes.size(); i++) { + // split on batch is not supported in TensorRT + PADDLE_ENFORCE_NE(axes[i], 0, platform::errors::InvalidArgument( + "Invalid slice axis.
Slice on batch " + "axis is not supported in TensorRT")); + if (starts[i] < 0) { + starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0); + } + if (ends[i] < 0) { + ends[i] = std::max(ends[i] + input_dims.d[axes[i]], 0); + } + ends[i] = std::min(ends[i], input_dims.d[axes[i]]); + PADDLE_ENFORCE_GT( + ends[i], starts[i], + platform::errors::InvalidArgument( + "Attr(ends) should be greater than attr(starts) in " + "slice op. But received ends = %d, starts = %d.", + ends[i], starts[i])); + } + } + nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { +#if IS_TRT_VERSION_GE(6000) bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SlicePluginDynamic* plugin = - new plugin::SlicePluginDynamic(starts, ends, ends, ban_fp16); + new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16); layer = engine_->AddPluginV2(&input, 1, plugin); - } else { +#else PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) interface" - " to set the shape information to run the dynamic shape mode.")); + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); +#endif + } else { + bool ban_fp16 = engine_->disable_trt_plugin_fp16(); + plugin::SlicePlugin* plugin = + new plugin::SlicePlugin(starts, ends, axes, ban_fp16); + layer = engine_->AddPlugin(&input, 1, plugin); } auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "skip_layernorm", {output_name}, test_mode); -#else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); -#endif + RreplenishLayerAndOutput(layer, "slice", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index 4b3e1c9e70a4a94808c94c81fcc773482f0574e4..e220d80f0d79da5eab98aa7a18a5093f9f4a55c4 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -28,11 +28,20 @@ class SwishOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE(input_num == 1); + PADDLE_ENFORCE_EQ(input_num, 1, + platform::errors::InvalidArgument( + "The input X's size must equal to 1 in TRT swish op." + " But received X's size %d.", + input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); // Get output size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE(output_num == 1); + PADDLE_ENFORCE_EQ( + output_num, 1UL, + platform::errors::InvalidArgument( + "The ouput Out's size must equal to 1 in TRT swish op. 
" + "But received Out's size %u.", + output_num)); // Get attrs float beta = BOOST_GET_CONST(float, op_desc.GetAttr("beta")); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 3c48c8192f6b06e5a0ba005738383b46bc550ecb..cfb25eb2ba82763950babda5385649d31d2e9185 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -49,7 +49,10 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, const platform::DeviceContext& ctx) { auto dims = tensor->dims(); size_t num_elements = analysis::AccuDims(dims, dims.size()); - PADDLE_ENFORCE_GT(num_elements, 0); + PADDLE_ENFORCE_GT( + num_elements, 0UL, + platform::errors::PermissionDenied("RandomizeTensor only can be used for " + "tensor which dims is not zero.")); platform::CPUPlace cpu_place; framework::LoDTensor temp_tensor; @@ -79,7 +82,8 @@ class TRTConvertValidation { scope_(scope), if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { - PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0, + platform::errors::External("cudaStreamCreate error.")); engine_.reset(new TensorRTEngine(max_batch_size, workspace_size)); engine_->InitNetwork(); } @@ -154,7 +158,12 @@ class TRTConvertValidation { void Execute(int batch_size, std::unordered_set neglected_output = {}) { // Execute Fluid Op - PADDLE_ENFORCE_LE(batch_size, max_batch_size_); + PADDLE_ENFORCE_LE(batch_size, max_batch_size_, + platform::errors::InvalidArgument( + "Runtime batch_size should be less than or equal to " + "max_batch_size_. " + "But received batch_size:%d, max_batch_size_:%d", + batch_size, max_batch_size_)); platform::CUDADeviceContext ctx(place_); op_->Run(scope_, place_); cudaStreamSynchronize(stream_); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 22be877493272cd393538fd4f04184e77d38e2db..754979f77acd7a3b4818cdf16ef9c525bf1d82ea 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -63,11 +63,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector *buffers, void TensorRTEngine::FreezeNetwork() { freshDeviceId(); VLOG(3) << "TRT to freeze network"; - PADDLE_ENFORCE(infer_builder_ != nullptr, - "Call InitNetwork first to initialize network."); - PADDLE_ENFORCE_EQ(network() != nullptr, true, - platform::errors::InvalidArgument( - "Call InitNetwork first to initialize network.")); + PADDLE_ENFORCE_NOT_NULL(infer_builder_, + platform::errors::InvalidArgument( + "Inference builder of TRT is null. Please make " + "sure you call InitNetwork first.")); + PADDLE_ENFORCE_NOT_NULL(network(), + platform::errors::InvalidArgument( + "Call InitNetwork first to initialize network.")); // build engine. infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); @@ -210,7 +212,10 @@ void TensorRTEngine::FreezeNetwork() { } else { infer_engine_.reset(infer_builder_->buildCudaEngine(*network())); } - PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); + PADDLE_ENFORCE_NOT_NULL( + infer_engine_, platform::errors::Fatal( + "Build TensorRT cuda engine failed! 
Please recheck " + "you configurations related to paddle-TensorRT.")); } nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, @@ -220,8 +225,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, platform::errors::InvalidArgument( "The TRT network should be initialized first.")); auto *input = network()->addInput(name.c_str(), dtype, dims); - PADDLE_ENFORCE(input, "infer network add input %s failed", name); - PADDLE_ENFORCE(input->isNetworkInput()); + PADDLE_ENFORCE_NOT_NULL( + input, platform::errors::InvalidArgument("Adding input %s failed in " + "TensorRT inference network. " + "Please recheck your input.", + name)); + PADDLE_ENFORCE_EQ(input->isNetworkInput(), true, + platform::errors::InvalidArgument( + "Input %s is not the input of TRT inference network. " + "Please recheck your input.", + name)); TensorRTEngine::SetITensor(name, input); return input; } @@ -230,31 +243,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, const std::string &name) { auto *output = layer->getOutput(offset); SetITensor(name, output); - PADDLE_ENFORCE(output != nullptr); + PADDLE_ENFORCE_NOT_NULL( + output, platform::errors::InvalidArgument( + "The output %s of TRT engine should not be null.", name)); output->setName(name.c_str()); - PADDLE_ENFORCE(!output->isNetworkInput()); + PADDLE_ENFORCE_EQ(output->isNetworkInput(), false, + platform::errors::InvalidArgument( + "The output %s of TRT engine should not be the input " + "of the network at the same time.", + name)); network()->markOutput(*output); - PADDLE_ENFORCE(output->isNetworkOutput()); + PADDLE_ENFORCE_EQ( + output->isNetworkOutput(), true, + platform::errors::InvalidArgument( + "The output %s of TRT engine should be the output of the network.", + name)); } void TensorRTEngine::DeclareOutput(const std::string &name) { auto *output = TensorRTEngine::GetITensor(name); - PADDLE_ENFORCE(output != nullptr); + PADDLE_ENFORCE_NOT_NULL( + output, platform::errors::InvalidArgument( + "The output %s of TRT engine should not be null.", name)); output->setName(name.c_str()); - PADDLE_ENFORCE(!output->isNetworkInput()); + PADDLE_ENFORCE_EQ(output->isNetworkInput(), false, + platform::errors::InvalidArgument( + "The output %s of TRT engine should not be the input " + "of the network at the same time.", + name)); network()->markOutput(*output); } void TensorRTEngine::SetITensor(const std::string &name, nvinfer1::ITensor *tensor) { - PADDLE_ENFORCE(tensor != nullptr); - PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s", - name); + PADDLE_ENFORCE_NOT_NULL( + tensor, platform::errors::InvalidArgument( + "Tensor named %s of TRT engine should not be null.", name)); + PADDLE_ENFORCE_EQ( + 0, itensor_map_.count(name), + platform::errors::InvalidArgument( + "Tensor named %s of TRT engine should not be duplicated", name)); itensor_map_[name] = tensor; } nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { - PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name); + PADDLE_ENFORCE_EQ(itensor_map_.count(name), true, + platform::errors::NotFound( + "Tensor named %s is not found in TRT engine", name)); return itensor_map_[name]; } @@ -271,11 +306,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name, std::string splitter = "__"; std::string name_with_suffix = name + splitter + name_suffix; platform::CPUPlace cpu_place; - PADDLE_ENFORCE_EQ( - weight_map.count(name_with_suffix), 0, - "During TRT Op converter: We set weight %s with the 
same name " - "twice into the weight_map", - name_with_suffix); + PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), 0, + platform::errors::AlreadyExists( + "The weight named %s is set into the weight map " + "twice in TRT OP converter.", + name_with_suffix)); weight_map[name_with_suffix].reset(new framework::Tensor()); weight_map[name_with_suffix]->Resize(weight_tensor->dims()); TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get()); @@ -297,7 +332,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( void TensorRTEngine::freshDeviceId() { int count; cudaGetDeviceCount(&count); - PADDLE_ENFORCE_LT(device_id_, count); + PADDLE_ENFORCE_LT(device_id_, count, + platform::errors::OutOfRange( + "Device id %d exceeds the current device count: %d.", + device_id_, count)); cudaSetDevice(device_id_); } diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 1a3413657ce6fac41603d691dcdb61ddb1d6320a..a85ed483c1d12c3f2eecc5ed4bcb99937397a765 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -196,8 +196,10 @@ class TensorRTEngine { } nvinfer1::IHostMemory* Serialize() { - PADDLE_ENFORCE(infer_engine_ != nullptr, - "You should build engine first and then serialize"); + PADDLE_ENFORCE_NOT_NULL( + infer_engine_, + platform::errors::InvalidArgument( + "The TensorRT engine must be built first before serialization")); ihost_memory_.reset(infer_engine_->serialize()); return ihost_memory_.get(); } @@ -222,8 +224,14 @@ class TensorRTEngine { engine_serialized_data.c_str(), engine_serialized_data.size(), &inference::Singleton::Global())); } - PADDLE_ENFORCE(infer_engine_ != nullptr, - "build cuda engine failed when deserialize engine info.!"); + PADDLE_ENFORCE_NOT_NULL( + infer_engine_, + platform::errors::Fatal( + "Building TRT cuda engine failed when deserializing engine info. " + "Please check:\n1. Your TRT serialization is generated and loaded " + "on the same GPU architecture;\n2. 
The Paddle Inference version of " + "generating serialization file and doing inference are " + "consistent.")); } void SetRuntimeBatch(size_t batch_size); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index a5b71356d0eca43555f4190b8cac2055a3eb679c..31128ba8c5d42acac0dff321adbc40dbb0ce0c19 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -31,6 +31,7 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("fused_embedding_eltwise_layernorm"); teller_set.insert("multihead_matmul"); teller_set.insert("skip_layernorm"); + teller_set.insert("slice"); #endif } diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 0ec803fe64afadd970777e3b0d0ab5d37fcc4d22..457d9dd87375477926480bce0a84e8f89c409698 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data, nvinfer1::Dims ElementWisePlugin::getOutputDimensions( int index, const nvinfer1::Dims *input_dims, int num_inputs) { - PADDLE_ENFORCE_EQ(index, 0); - PADDLE_ENFORCE_EQ(num_inputs, 2); - PADDLE_ENFORCE_NOT_NULL(input_dims); + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "There is only one output in TRT elementwise " + "op plugin, but got output index: %d.", + index)); + PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument( + "There are 2 inputs in TRT elementwise " + "op plugin, but got input number: %d.", + num_inputs)); + PADDLE_ENFORCE_NOT_NULL( + input_dims, + platform::errors::InvalidArgument( + "The input dims of TRT elementwise op plugin should not be null.")); return input_dims[0]; } int ElementWisePlugin::initialize() { - PADDLE_ENFORCE_GT(dims_y_.nbDims, 0); + PADDLE_ENFORCE_GT(dims_y_.nbDims, 0, + platform::errors::InvalidArgument( + "The dimension of input Y of TRT elementwise op plugin " + "should be greater than 0, but got %d.", + dims_y_.nbDims)); axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_; int trimed_nb_dims = dims_y_.nbDims; @@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() { } dims_y_.nbDims = trimed_nb_dims; - PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_); - PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims); + PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_, + platform::errors::InvalidArgument( + "We expect [number of x dims] >= [number of y dims + " + "axis] in TRT elementwise op plugin, but got [number " + "of x dims] = %d, [number of y dims + axis] = %d.", + dims_x_.nbDims, dims_y_.nbDims + axis_)); + PADDLE_ENFORCE_LT( + axis_, dims_x_.nbDims, + platform::errors::InvalidArgument("We expect [axis] < [number of x dims] " + "in TRT elementwise op plugin, but got " + "[axis] = %d, [number of x dims] = %d.", + axis_, dims_x_.nbDims)); prev_size_ = 1; midd_size_ = 1; @@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() { for (int i = 0; i < dims_y_.nbDims; ++i) { PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i], - "Broadcast dimension mismatch."); + platform::errors::InvalidArgument( + "Broadcast dimension mismatch. 
The dims of input Y " + "should be a subsequence of X.")); midd_size_ *= dims_y_.d[i]; } @@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue( elementwise_kernel<<>>( num, x, y, out, prev_size, midd_size, post_size, details::Mul()); } else { - PADDLE_THROW("Not implemented."); + PADDLE_THROW(platform::errors::Unimplemented( + "Paddle-TRT only support elementwise operation: {add, mul} currently, " + "but got %s.", + type_)); } return cudaGetLastError() != cudaSuccess; diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index 4fb1d8241084d7af787c32949b63819cddbfcb82..5c56270627a6fcb49eb0713d2282c224719fc38d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -26,8 +26,10 @@ namespace inference { namespace tensorrt { namespace plugin { -// Dynamic Plugin below. -#if IS_TRT_VERSION_GE(6000) +SlicePlugin *CreateSlicePluginDeserialize(const void *buffer, size_t length) { + return new SlicePlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("slice_plugin", CreateSlicePluginDeserialize); template __global__ void SliceKernel(int num, int dims, const T *input, @@ -56,11 +58,196 @@ __global__ void SliceKernel(int num, int dims, const T *input, } } +SlicePlugin::SlicePlugin(std::vector starts, std::vector ends, + std::vector axes, bool ban_fp16) + : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) { + cudaEventCreate(©_event_); + cudaStreamCreate(©_stream_); +} + +SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); + DeserializeValue(&serial_data, &serial_length, &starts_); + DeserializeValue(&serial_data, &serial_length, &ends_); + DeserializeValue(&serial_data, &serial_length, &axes_); + DeserializeValue(&serial_data, &serial_length, &ban_fp16_); + cudaEventCreate(©_event_); + cudaStreamCreate(©_stream_); +} + +SlicePlugin::~SlicePlugin() { + cudaStreamDestroy(copy_stream_); + cudaEventDestroy(copy_event_); + cudaFree(offset_temp_data_); +} + +SlicePlugin *SlicePlugin::clone() const { + return new SlicePlugin(starts_, ends_, axes_, ban_fp16_); +} + +bool SlicePlugin::supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const { +#ifdef SUPPORTS_CUDA_FP16 + return ((type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF) && + (format == nvinfer1::PluginFormat::kNCHW)); +#else + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kNCHW)); +#endif +} + +nvinfer1::Dims SlicePlugin::getOutputDimensions(int index, + const nvinfer1::Dims *inputs, + int nb_input_dims) { + auto in_dims = inputs[0]; + nvinfer1::Dims out_dims = in_dims; + for (size_t i = 0; i < axes_.size(); i++) { + int start = starts_[i]; + int end = ends_[i]; + out_dims.d[axes_[i] - 1] = end - start; + } + return out_dims; +} + +int SlicePlugin::enqueue(int batch_size, const void *const *inputs, + void **outputs, void *workspace, cudaStream_t stream) { + auto input_dims = getInputDims(0); + + // notice input dims is [C, H, W], add input batch dim here + auto out_dims = getOutputDimensions(0, &input_dims, 1); + input_dims.nbDims += 1; + out_dims.nbDims += 1; + for (auto i = input_dims.nbDims; i > 0; --i) { + input_dims.d[i] = input_dims.d[i - 1]; + out_dims.d[i] = out_dims.d[i - 1]; + } + input_dims.d[0] = batch_size; + out_dims.d[0] = batch_size; + + auto num_dims = input_dims.nbDims; + size_t out_num = ProductDim(out_dims); 
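+  // For every dimension we pack a (start offset, extent, element stride)
+  // triplet into offset_info below; SliceKernel uses the strides to decode a
+  // flat output index into per-dimension coordinates and shifts them by the
+  // start offsets to locate the matching input element.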
+
+  std::vector<int> seg_offsets;
+  std::vector<int> offsets;
+  std::vector<int> extends;
+
+  offsets.resize(num_dims);
+  extends.resize(num_dims);
+  seg_offsets.resize(num_dims);
+
+  seg_offsets[num_dims - 1] = 1;
+  for (int i = num_dims - 2; i >= 0; i--) {
+    seg_offsets[i] = input_dims.d[i + 1] * seg_offsets[i + 1];
+  }
+  for (size_t i = 0; i < num_dims; ++i) {
+    offsets[i] = 0;
+    extends[i] = out_dims.d[i];
+  }
+  for (size_t i = 0; i < axes_.size(); ++i) {
+    offsets[axes_[i]] = starts_[i];
+  }
+
+  std::vector<int> offset_info;
+  for (size_t i = 0; i < num_dims; ++i) {
+    offset_info.push_back(offsets[i]);
+    offset_info.push_back(extends[i]);
+    offset_info.push_back(seg_offsets[i]);
+  }
+
+  if (offset_temp_data_ == nullptr) {
+    cudaMalloc(&offset_temp_data_, 3 * num_dims * sizeof(int));
+  }
+
+  cudaMemcpyAsync(offset_temp_data_, offset_info.data(),
+                  sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice,
+                  copy_stream_);
+
+  cudaEventRecord(copy_event_, copy_stream_);
+  cudaStreamWaitEvent(stream, copy_event_, 0);
+
+  int threads = 256;
+  int blocks = (out_num + threads - 1) / threads;
+  auto input_type = getDataType();
+  if (input_type == nvinfer1::DataType::kFLOAT) {
+    const float *input1 = static_cast<const float *>(inputs[0]);
+    float *output = static_cast<float *>(outputs[0]);
+    SliceKernel<float><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
+        out_num, num_dims, input1, offset_temp_data_, output);
+  } else if (input_type == nvinfer1::DataType::kHALF) {
+#ifdef SUPPORTS_CUDA_FP16
+    const half *input1 = static_cast<const half *>(inputs[0]);
+    half *output = static_cast<half *>(outputs[0]);
+    SliceKernel<half><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
+        out_num, num_dims, input1, offset_temp_data_, output);
+#else
+    PADDLE_THROW(platform::errors::Fatal(
+        "The CUDA arch you specify should be greater than 600."));
+#endif
+  } else {
+    PADDLE_THROW(platform::errors::Fatal(
+        "The Slice TRT Plugin's input type should be float or half."));
+  }
+  return cudaGetLastError() != cudaSuccess;
+}
+
+size_t SlicePlugin::getSerializationSize() {
+  return getBaseSerializationSize() + SerializedSize(getPluginType()) +
+         SerializedSize(starts_) + SerializedSize(ends_) +
+         SerializedSize(axes_) + SerializedSize(ban_fp16_);
+}
+
+void SlicePlugin::serialize(void *buffer) {
+  SerializeValue(&buffer, getPluginType());
+  serializeBase(buffer);
+  SerializeValue(&buffer, starts_);
+  SerializeValue(&buffer, ends_);
+  SerializeValue(&buffer, axes_);
+  SerializeValue(&buffer, ban_fp16_);
+}
+
+// Dynamic Plugin below.
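+// Note that the static SlicePlugin above is restored through the legacy
+// REGISTER_TRT_PLUGIN factory, so its serialize() must write the plugin type
+// and the base fields first. The IPluginV2 plugin below is instead recreated
+// by SlicePluginV2Creator::deserializePlugin and only serializes the four
+// slice fields themselves.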
+#if IS_TRT_VERSION_GE(6000)
+SlicePluginDynamic::SlicePluginDynamic(std::vector<int> starts,
+                                       std::vector<int> ends,
+                                       std::vector<int> axes, bool ban_fp16)
+    : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {
+  cudaEventCreate(&copy_event_);
+  cudaStreamCreate(&copy_stream_);
+}
+
+SlicePluginDynamic::SlicePluginDynamic(void const *serialData,
+                                       size_t serialLength) {
+  DeserializeValue(&serialData, &serialLength, &starts_);
+  DeserializeValue(&serialData, &serialLength, &ends_);
+  DeserializeValue(&serialData, &serialLength, &axes_);
+  DeserializeValue(&serialData, &serialLength, &ban_fp16_);
+  cudaEventCreate(&copy_event_);
+  cudaStreamCreate(&copy_stream_);
+}
+
+void SlicePluginDynamic::destroy() {
+  cudaStreamDestroy(copy_stream_);
+  cudaEventDestroy(copy_event_);
+  cudaFree(offset_temp_data_);
+  delete this;
+}
+
 int SlicePluginDynamic::initialize() { return 0; }
 
-size_t SlicePluginDynamic::getSerializationSize() const { return 0; }
+size_t SlicePluginDynamic::getSerializationSize() const {
+  size_t size = SerializedSize(starts_) + SerializedSize(ends_) +
+                SerializedSize(axes_) + SerializedSize(ban_fp16_);
 
-void SlicePluginDynamic::serialize(void *buffer) const {}
+  return size;
+}
+
+void SlicePluginDynamic::serialize(void *buffer) const {
+  SerializeValue(&buffer, starts_);
+  SerializeValue(&buffer, ends_);
+  SerializeValue(&buffer, axes_);
+  SerializeValue(&buffer, ban_fp16_);
+}
 
 nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
@@ -136,9 +323,9 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
   std::vector<int> offsets;
   std::vector<int> extends;
 
-  offsets.reserve(num_dims);
-  extends.reserve(num_dims);
-  seg_offsets.reserve(num_dims);
+  offsets.resize(num_dims);
+  extends.resize(num_dims);
+  seg_offsets.resize(num_dims);
 
   seg_offsets[num_dims - 1] = 1;
   for (int i = num_dims - 2; i >= 0; i--) {
@@ -160,16 +347,16 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
     offset_info.push_back(seg_offsets[i]);
   }
 
-  framework::Tensor offset_temp_tensor;
+  if (offset_temp_data_ == nullptr) {
+    cudaMalloc(&offset_temp_data_, 3 * num_dims * sizeof(int));
+  }
 
-  int device_id;
-  cudaGetDevice(&device_id);
-  offset_temp_tensor.Resize({3 * num_dims});
-  auto *offset_temp_data =
-      offset_temp_tensor.mutable_data<int>(platform::CUDAPlace(device_id));
+  cudaMemcpyAsync(offset_temp_data_, offset_info.data(),
+                  sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice,
+                  copy_stream_);
 
-  cudaMemcpyAsync(offset_temp_data, offset_info.data(),
-                  sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, stream);
+  cudaEventRecord(copy_event_, copy_stream_);
+  cudaStreamWaitEvent(stream, copy_event_, 0);
 
   int threads = 256;
   int blocks = (out_num + threads - 1) / threads;
@@ -178,13 +365,13 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
     const float *input1 = static_cast<const float *>(inputs[0]);
     float *output = static_cast<float *>(outputs[0]);
     SliceKernel<float><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
-        out_num, num_dims, input1, offset_temp_data, output);
+        out_num, num_dims, input1, offset_temp_data_, output);
   } else if (input_type == nvinfer1::DataType::kHALF) {
 #ifdef SUPPORTS_CUDA_FP16
     const half *input1 = static_cast<const half *>(inputs[0]);
     half *output = static_cast<half *>(outputs[0]);
     SliceKernel<half><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
-        out_num, num_dims, input1, offset_temp_data, output);
+        out_num, num_dims, input1, offset_temp_data_, output);
 #else
     PADDLE_THROW(platform::errors::Fatal(
         "The CUDA arch you specify should be greater than 600."));
diff --git
a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 13d86df131f6fff58dc896d802c8f3ad959b30bc..e36a270f05d9fee497fa1a033ed16faf08c08225 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -26,17 +26,56 @@ namespace inference { namespace tensorrt { namespace plugin { +class SlicePlugin : public PluginTensorRT { + public: + explicit SlicePlugin(std::vector starts, std::vector ends, + std::vector axes, bool ban_fp16); + + // It was used for tensorrt deserialization. + // It should not be called by users. + SlicePlugin(void const* serial_data, size_t serial_length); + ~SlicePlugin(); + SlicePlugin* clone() const override; + + const char* getPluginType() const override { return "slice_plugin"; } + int getNbOutputs() const override { return 1; } + int initialize() override { return 0; } + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + + protected: + size_t getSerializationSize() override; + + // TRT will call this func to serialize the configuration of TRT + // It should not be called by users. + void serialize(void* buffer) override; + + private: + std::vector starts_; + std::vector ends_; + std::vector axes_; + bool ban_fp16_{false}; + int* offset_temp_data_{nullptr}; + cudaEvent_t copy_event_; + cudaStream_t copy_stream_; +}; + #if IS_TRT_VERSION_GE(6000) class SlicePluginDynamic : public DynamicPluginTensorRT { public: explicit SlicePluginDynamic(std::vector starts, std::vector ends, - std::vector axes, bool ban_fp16) - : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {} - SlicePluginDynamic(void const* serialData, size_t serialLength) {} + std::vector axes, bool ban_fp16); + nvinfer1::IPluginV2DynamicExt* clone() const override { return new SlicePluginDynamic(starts_, ends_, axes_, ban_fp16_); } + SlicePluginDynamic(void const* serialData, size_t serialLength); + const char* getPluginType() const override { return "slice_plugin"; } int getNbOutputs() const override { return 1; } int initialize() override; @@ -72,15 +111,54 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { const nvinfer1::DataType* inputTypes, int nbInputs) const override; - void destroy() override { delete this; } + void destroy() override; private: std::vector starts_; std::vector ends_; std::vector axes_; - bool ban_fp16_{false}; + int* offset_temp_data_{nullptr}; + cudaEvent_t copy_event_; + cudaStream_t copy_stream_; }; + +class SlicePluginV2Creator : public nvinfer1::IPluginCreator { + public: + SlicePluginV2Creator() {} + const char* getPluginName() const override { return "slice_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) override { + auto plugin = new SlicePluginDynamic(serialData, serialLength); + return plugin; + } + + void setPluginNamespace(const char* libNamespace) override { + 
namespace_ = libNamespace; + } + + const char* getPluginNamespace() const override { return namespace_.c_str(); } + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + +REGISTER_TRT_PLUGIN_V2(SlicePluginV2Creator); + #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index a03dd45db0f80487cb4c2e6b68f94944e8558ae4..72962c733ecf6a7bc6871fd3a5c65d6156b084d4 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) { nvinfer1::DimsCHW{1, 1, 1}); auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, weight.get(), bias.get()); - PADDLE_ENFORCE(fc_layer != nullptr); + PADDLE_ENFORCE_NOT_NULL(fc_layer, + platform::errors::InvalidArgument( + "TRT fully connected layer building failed.")); engine_->DeclareOutput(fc_layer, 0, "y"); LOG(INFO) << "freeze network"; @@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { nvinfer1::DimsCHW{1, 2, 1}); auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, weight.get(), bias.get()); - PADDLE_ENFORCE(fc_layer != nullptr); + PADDLE_ENFORCE_NOT_NULL(fc_layer, + platform::errors::InvalidArgument( + "TRT fully connected layer building failed.")); engine_->DeclareOutput(fc_layer, 0, "y"); engine_->FreezeNetwork(); @@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) { auto *conv_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, weight.get(), bias.get()); - PADDLE_ENFORCE(conv_layer != nullptr); + PADDLE_ENFORCE_NOT_NULL(conv_layer, + platform::errors::InvalidArgument( + "TRT convolution layer building failed.")); conv_layer->setStride(nvinfer1::DimsHW{1, 1}); conv_layer->setPadding(nvinfer1::DimsHW{1, 1}); @@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) { auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t, nvinfer1::DimsHW{2, 2}); - PADDLE_ENFORCE(pool_layer != nullptr); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::InvalidArgument("TRT pooling layer building failed.")); pool_layer->setStride(nvinfer1::DimsHW{1, 1}); pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc index 34b7072b2ee688c2ac01229ff5d3a234af3680b5..743f7740e5faaa1991172ef2a8d1cd38ad47fab5 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch( engine_name_, it.first)); } const auto& d = dataptr->second; - PADDLE_ENFORCE( - cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), - "Fail to cudaMemcpy %s for %s", engine_name_, it.first); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice)); } data_is_set_ = true; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 6dd13d32e6e25f1657f351ff3a54562435b098f3..a1b43de469542a1612600a5a22c19c8745179afb 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -342,9 +342,9 @@ if(WITH_MKLDNN) ### Lexcial analysis GRU model set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru") download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz") - download_GRU_data("${GRU_PATH}" 
"GRU_eval_model.tar.gz") + download_GRU_data("${GRU_PATH}" "GRU_eval_model_v2.tar.gz") set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin") - set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model") + set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2") set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis") set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc") @@ -363,9 +363,12 @@ if(WITH_MKLDNN) inference_analysis_api_test_build(${QUANT_IMG_CLASS_TEST_APP} ${QUANT_IMG_CLASS_TEST_APP_SRC}) # MobileNetV1 FP32 vs. Quant INT8 + # The FP32 model should already be downloaded for slim Quant unit tests on Linux set(QUANT2_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2") set(QUANT2_INT8_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8") - download_quant_data(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") + if(NOT LINUX) + download_quant_data(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") + endif(NOT LINUX) download_quant_data(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH}) @@ -477,10 +480,9 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") endif() - # disable test_trt_dynamic_shape_ernie_ser_deser temporary - #inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc - # EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - # ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) + inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index f956c34f23ac7cc6ca06b9fcf411d0f2e9b29c54..2570325c24abcbb4bd459944480d3279f24fab1f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -245,8 +245,14 @@ TEST(Analyzer_bert, transfer_scope_cache) { // Since paddle::framework::global_transfer_scope_cache() and // paddle::framework::global_transfer_data_cache() are thread_local, // their pointer should be different among different thread id. 
-  PADDLE_ENFORCE(global_transfer_scope_cache.size(), threads_num);
-  PADDLE_ENFORCE(global_transfer_data_cache.size(), threads_num);
+  PADDLE_ENFORCE_EQ(
+      global_transfer_scope_cache.size(), threads_num,
+      paddle::platform::errors::Fatal(
+          "The size of scope cache is not equal to thread number."));
+  PADDLE_ENFORCE_EQ(
+      global_transfer_data_cache.size(), threads_num,
+      paddle::platform::errors::Fatal(
+          "The size of data cache is not equal to thread number."));
 }
 
 }  // namespace inference
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
index 0bc67aff7af1be9f34ffa2bb71c25d2964a62521..a9c24c4503f9f1b803c0d9fcde21199ef4089c41 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc
@@ -69,11 +69,13 @@ void PD_run() {
   PD_DeletePaddleTensor(input);
   int size;
   const int* out_shape = PD_GetPaddleTensorShape(out_data, &size);
-  CHECK(size == 2) << "The Output shape's size is NOT match.";
+  PADDLE_ENFORCE_EQ(size, 2, paddle::platform::errors::InvalidArgument(
+                                 "The Output shape's size does NOT match."));
   std::vector<int> ref_outshape_size({9, 6});
   for (int i = 0; i < 2; ++i) {
-    CHECK(out_shape[i] == ref_outshape_size[i])
-        << "The Output's shape is NOT match.";
+    PADDLE_ENFORCE_EQ(out_shape[i], ref_outshape_size[i],
+                      paddle::platform::errors::InvalidArgument(
+                          "The Output's shape does NOT match."));
   }
   PD_DeletePaddleBuf(buf);
 }
diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
index d76799a679cbf27700c6d9af4f2e2e50c5e33e35..fd20581123c10f8c569e7765c7a0bf17ddd1d0b9 100644
--- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc
@@ -36,9 +36,9 @@ void zero_copy_run() {
   PD_SwitchIrDebug(config, true);
   PD_SetModel(config, prog_file.c_str(), params_file.c_str());
   bool use_feed_fetch = PD_UseFeedFetchOpsEnabled(config);
-  CHECK(!use_feed_fetch) << "NO";
+  EXPECT_FALSE(use_feed_fetch);
   bool specify_input_names = PD_SpecifyInputName(config);
-  CHECK(specify_input_names) << "NO";
+  EXPECT_TRUE(specify_input_names);
 
   const int batch_size = 1;
   const int channels = 3;
@@ -85,13 +85,13 @@ TEST(PD_AnalysisConfig, profile_mkldnn) {
   PD_SwitchIrDebug(config, true);
   PD_EnableMKLDNN(config);
   bool mkldnn_enable = PD_MkldnnEnabled(config);
-  CHECK(mkldnn_enable) << "NO";
+  EXPECT_TRUE(mkldnn_enable);
   PD_EnableMkldnnQuantizer(config);
   bool quantizer_enable = PD_MkldnnQuantizerEnabled(config);
-  CHECK(quantizer_enable) << "NO";
+  EXPECT_TRUE(quantizer_enable);
   PD_EnableMkldnnBfloat16(config);
   bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config);
-  CHECK(bfloat16_enable) << "NO";
+  EXPECT_TRUE(bfloat16_enable);
   PD_SetMkldnnCacheCapacity(config, 0);
   PD_SetModel(config, prog_file.c_str(), params_file.c_str());
   PD_DeleteAnalysisConfig(config);
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index 00a475b6047e8215264c664dd3c775b9687eb0ff..d61c28c30d203acf4dd48e1461a881d61f8ec263 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -126,7 +126,9 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   std::string turn_mask_pre = "turn_mask_";
 
   auto one_batch = data->NextBatch();
-  PADDLE_ENFORCE(!one_batch.response.empty());
+  PADDLE_ENFORCE(
+      !one_batch.response.empty(),
+      paddle::platform::errors::Fatal("The response of one batch is empty."));
   int size = one_batch.response[0].size();
   CHECK_EQ(size, kMaxTurnLen);
   // turn tensor assignment
@@ -214,11 +216,17 @@ void profile(bool use_mkldnn = false) {
                  input_slots_all, &outputs, FLAGS_num_threads);
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of outputs should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_GT(output.size(), 0);
+    PADDLE_ENFORCE_GT(output.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
+    PADDLE_ENFORCE_GT(size, 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     float *result = static_cast<float *>(output[0].data.data());
     for (size_t i = 0; i < size; i++) {
       EXPECT_NEAR(result[i], result_data[i], 1e-3);
diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
index 7f06a3b9023ba3e907c9731d576f014a3e451113..91a3233b9851f1def7717d04c4c9df5275a805ee 100644
--- a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc
@@ -146,8 +146,9 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
   auto iterations = test_data.size();
   PADDLE_ENFORCE_LE(
       static_cast<size_t>(num_images), iterations * test_data_batch_size,
-      "The requested quantization warmup data size " +
-          std::to_string(num_images) + " is bigger than all test data size.");
+      paddle::platform::errors::Fatal(
+          "The requested quantization warmup data size " +
+          std::to_string(num_images) + " is bigger than all test data size."));
 
   PaddleTensor images;
   images.name = "image";
@@ -237,8 +238,9 @@ std::shared_ptr<std::vector<PaddleTensor>> GetWarmupData(
   }
   PADDLE_ENFORCE_EQ(
       static_cast<size_t>(num_objects), static_cast<size_t>(objects_accum),
-      "The requested num of objects " + std::to_string(num_objects) +
-          " is the same as objects_accum.");
+      paddle::platform::errors::Fatal("The requested num of objects " +
+                                      std::to_string(num_objects) +
+                                      " is not the same as objects_accum."));
 
   auto warmup_data = std::make_shared<std::vector<PaddleTensor>>(4);
   (*warmup_data)[0] = std::move(images);
diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
index 142905dcd8d9964d93d0c5f7444823eef2b84900..bd3a1d737afb1ba230015fbd602c493f33952ffb 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -98,7 +98,9 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   input_tensor.name = "word";
   input_tensor.dtype = PaddleDType::INT64;
   TensorAssignData<int64_t>(&input_tensor, {one_batch.data}, one_batch.lod);
-  PADDLE_ENFORCE_EQ(batch_size, static_cast<int>(one_batch.lod.size() - 1));
+  PADDLE_ENFORCE_EQ(
+      batch_size, static_cast<int>(one_batch.lod.size() - 1),
+      paddle::platform::errors::Fatal("The lod size of one batch is invalid."));
   input_slots->assign({input_tensor});
 }
 
@@ -137,12 +139,17 @@ TEST(Analyzer_LAC, profile) {
         24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25,
         44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14,
         15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    PADDLE_ENFORCE_EQ(output.size(), 1UL,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be equal to 1."));
     size_t size = GetSize(output[0]);
     size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
-    PADDLE_ENFORCE_GE(size, batch1_size);
+    PADDLE_ENFORCE_GE(size, batch1_size, paddle::platform::errors::Fatal(
+                                             "The size of batch is invalid."));
     int64_t *pdata = static_cast<int64_t *>(output[0].data.data());
     for (size_t i = 0; i < batch1_size; ++i) {
       EXPECT_EQ(pdata[i], lac_ref_data[i]);
diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
index 2a862b1395c222cf6d23216c9d4cf9196ffb519c..50a68361d536f5aab3ed2a6bafe60f2438a9c129 100644
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -117,11 +117,17 @@ void profile(bool memory_load = false) {
     // the first inference result
     const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                            48, 39, 38, 16, 25};
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    PADDLE_ENFORCE_EQ(output.size(), 1UL,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be equal to 1."));
     size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
+    PADDLE_ENFORCE_GT(size, 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     int64_t *result = static_cast<int64_t *>(output[0].data.data());
     for (size_t i = 0; i < std::min<size_t>(11, size); i++) {
       EXPECT_EQ(result[i], chinese_ner_result_data[i]);
diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
index 06a8e01b10c6eb70fe2cbac19725d96281863c29..bb1f0e8cd6334bab83973fb7d314f7017edd9e90 100644
--- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc
@@ -136,11 +136,17 @@ TEST(Analyzer_Pyramid_DNN, profile) {
                  input_slots_all, &outputs, FLAGS_num_threads);
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) {
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    PADDLE_ENFORCE_EQ(output.size(), 1UL,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be equal to 1."));
     size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
+    PADDLE_ENFORCE_GT(size, 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     float *result = static_cast<float *>(output[0].data.data());
     // output is probability, which is in (0, 1).
     for (size_t i = 0; i < size; i++) {
diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
index 9ccbf58cbd2bbaab9b1a132c27e50356e1a5df37..34a0a5f398d7fee0f8e44f0ad59ff9711263b575 100644
--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -135,11 +135,17 @@ TEST(Analyzer_rnn2, profile) {
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_GT(output.size(), 0);
+    PADDLE_ENFORCE_GT(output.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
+    PADDLE_ENFORCE_GT(size, 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     float *result = static_cast<float *>(output[0].data.data());
     for (size_t i = 0; i < size; i++) {
       EXPECT_NEAR(result[i], result_data[i], 1e-3);
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
index e3f8b835f78371170aaf107e1b2d1ca41b300e56..978aaf1c6a32d5b4ec8f2d06b8873af892705da5 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -47,7 +47,8 @@ struct DataRecord {
       num_lines++;
       std::vector<std::string> data;
       split(line, '\t', &data);
-      PADDLE_ENFORCE(data.size() >= 4);
+      PADDLE_ENFORCE_GE(data.size(), 4, paddle::platform::errors::Fatal(
+                                            "The size of data is invalid."));
      // load title1 data
      std::vector<int64_t> title1_data;
      split_to_int64(data[0], ' ', &title1_data);
@@ -120,11 +121,17 @@ TEST(Analyzer_seq_conv1, profile) {
 
   if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
     // the first inference result
-    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    PADDLE_ENFORCE_GT(outputs.size(), 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     auto output = outputs.back();
-    PADDLE_ENFORCE_EQ(output.size(), 1UL);
+    PADDLE_ENFORCE_EQ(output.size(), 1UL,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be equal to 1."));
     size_t size = GetSize(output[0]);
-    PADDLE_ENFORCE_GT(size, 0);
+    PADDLE_ENFORCE_GT(size, 0,
+                      paddle::platform::errors::Fatal(
+                          "The size of output should be greater than 0."));
     float *result = static_cast<float *>(output[0].data.data());
     // output is probability, which is in (0, 1).
for (size_t i = 0; i < size; i++) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 56f706ae56bda8b06eba5dd9e080552aa9785c6e..9f1556cdb871aa3e5bbe613aa98299c162661c42 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -56,20 +56,26 @@ struct DataRecord { std::vector slot_data; split_to_float(data[1], ' ', &slot_data); std::string name = data[0]; - PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL, - "line %d, %s should be divisible", num_lines, name); + PADDLE_ENFORCE_EQ( + slot_data.size() % 11, 0UL, + paddle::platform::errors::Fatal("line %d, %s should be divisible", + num_lines, name)); datasets[name].emplace_back(std::move(slot_data)); } num_samples = num_lines / num_slots; - PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast(num_lines), - "num samples should be divisible"); - PADDLE_ENFORCE_GT(num_samples, 0UL); + PADDLE_ENFORCE_EQ( + num_samples * num_slots, static_cast(num_lines), + paddle::platform::errors::Fatal("num samples should be divisible")); + PADDLE_ENFORCE_GT(num_samples, 0UL, + paddle::platform::errors::Fatal( + "The num of samples should be greater than 0.")); } void Prepare(int bs) { for (auto it = datasets.begin(); it != datasets.end(); ++it) { - PADDLE_ENFORCE_EQ(it->second.size(), num_samples, - "size of each slot should be equal"); + PADDLE_ENFORCE_EQ( + it->second.size(), num_samples, + paddle::platform::errors::Fatal("size of each slot should be equal")); } size_t num_batches = num_samples / bs; EXPECT_GT(num_batches, 0UL); @@ -90,8 +96,10 @@ struct DataRecord { std::copy(datas[id].begin(), datas[id].end(), std::back_inserter(slot.data[k])); size_t len = datas[id].size() / 11; - PADDLE_ENFORCE_EQ(len * 11, datas[id].size(), - "%s %d size should be divisible", slot.name, id); + PADDLE_ENFORCE_EQ( + len * 11, datas[id].size(), + paddle::platform::errors::Fatal("%s %d size should be divisible", + slot.name, id)); lod[k + 1] = lod[k] + len; } slot.shape.assign({static_cast(lod[bs]), 11}); diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 78e500b2ed530d5a1dce8a7927538fdd0bbb6907..ae38bcbc20a9f44eb6ef5c313b318dec38a30550 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -22,7 +22,9 @@ struct DataReader { : file(new std::ifstream(path)) {} bool NextBatch(std::vector *input, int batch_size) { - PADDLE_ENFORCE_EQ(batch_size, 1); + PADDLE_ENFORCE_EQ(batch_size, 1, + paddle::platform::errors::Fatal( + "The size of batch should be equal to 1.")); std::string line; PaddleTensor tensor; tensor.dtype = PaddleDType::INT64; @@ -81,7 +83,9 @@ TEST(Analyzer_Text_Classification, profile) { if (FLAGS_num_threads == 1) { // Get output - PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_GT(outputs.size(), 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); LOG(INFO) << "get outputs " << outputs.back().size(); for (auto &output : outputs.back()) { LOG(INFO) << "output.shape: " << to_string(output.shape); diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 65755b7b15ad54e38e398a82db41a0b9d8fc59e3..a2ced21a9ac9ad10c2b067a60597eee9fdff9eeb 100644 --- 
a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -59,7 +59,9 @@ void SetConfig(AnalysisConfig *cfg) { } void SetInput(std::vector> *inputs) { - PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); + PADDLE_ENFORCE_EQ( + FLAGS_test_all_data, 0, + paddle::platform::errors::Fatal("Only have single batch of data.")); std::string line; std::ifstream file(FLAGS_infer_data); std::getline(file, line); @@ -99,7 +101,9 @@ void profile(bool use_mkldnn = false) { auto refer = ProcessALine(line); file.close(); - PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_GT(outputs.size(), 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); auto &output = outputs.back().front(); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); CHECK_EQ(numel, refer.data.size()); diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc index 31701c59ec33dfced5745f7f16d8f00ffce462ef..9ae073e9e5b142254b32396e0355f59ae1826909 100644 --- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc @@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) { AnalysisConfig config; config.EnableUseGpu(100, 0); config.SetModel(model_dir + "/model", model_dir + "/params"); - config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32); + config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true); std::vector inputs; auto predictor = CreatePaddlePredictor(config); diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc index 524e08891f4e90d8a322822e26d75689526d30f5..685f7b6600e4d73731860135469a3072d8ce7f9a 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc @@ -12,15 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include #include +#include #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { namespace inference { +int DeleteCache(std::string path) { + DIR* dir = opendir(path.c_str()); + if (dir == NULL) return 0; + struct dirent* ptr; + while ((ptr = readdir(dir)) != NULL) { + if (std::strcmp(ptr->d_name, ".") == 0 || + std::strcmp(ptr->d_name, "..") == 0) { + continue; + } else if (ptr->d_type == 8) { + std::string file_rm = path + "/" + ptr->d_name; + return remove(file_rm.c_str()); + } + } + return 0; +} + void run(const AnalysisConfig& config, std::vector* out_data) { auto predictor = CreatePaddlePredictor(config); auto input_names = predictor->GetInputNames(); @@ -86,6 +104,11 @@ void run(const AnalysisConfig& config, std::vector* out_data) { void trt_ernie(bool with_fp16, std::vector result) { AnalysisConfig config; std::string model_dir = FLAGS_infer_model; + // Delete serialization cache to perform serialization first rather than + // deserialization. 
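+  // Every regular file under _opt_cache holds a serialized engine
+  // (DeleteCache above treats d_type == 8, i.e. DT_REG, as such a file), so
+  // removing the cached files forces the predictor to rebuild and
+  // re-serialize the engine instead of loading a stale cache.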
+ std::string opt_cache_dir = FLAGS_infer_model + "/_opt_cache"; + DeleteCache(opt_cache_dir); + SetConfig(&config, model_dir, true /* use_gpu */); config.SwitchUseFeedFetchOps(false); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 7183cbac71562bfe4092bf78270096996b74c525..1457f5337e3ed05b74d247d65e2f6b2f7f6735d3 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" @@ -162,7 +163,8 @@ void TestInference(const std::string& dirname, // int device_id = place.GetDeviceId(); paddle::platform::SetDeviceId(0); #else - PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); + PADDLE_THROW(paddle::platform::errors::Unavailable( + "'CUDAPlace' is not supported in CPU only device.")); #endif } diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index eb24ba84c886e3393cf36b6f764d7b33e76defeb..59c14103ca67dbf325928a9aee73d903d7d9e9e3 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -16,6 +16,7 @@ #include #include // NOLINT #include + #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cuda_allocator.h" @@ -41,12 +42,14 @@ TEST(BestFitAllocator, concurrent_cuda) { LockedAllocator concurrent_allocator( std::unique_ptr(new BestFitAllocator(cuda_allocation.get()))); + platform::CUDAPlace gpu(0); + platform::CUDADeviceContext dev_ctx(gpu); + auto th_main = [&](std::random_device::result_type seed) { std::default_random_engine engine(seed); std::uniform_int_distribution dist(1U, 1024U); - platform::CUDAPlace gpu(0); - platform::CUDADeviceContext dev_ctx(gpu); std::array buf; + for (size_t i = 0; i < 128; ++i) { size_t allocate_size = dist(engine); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 6e8ff52ed4a8846f5f6060e10cfd9bec22308e9e..f0a04d850dff01e0776e96bbe518cde2ce8bb88b 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -45,7 +45,9 @@ endif() SET(OP_HEADER_DEPS xxhash executor) if (WITH_GPU) - SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) + endif() endif() SET(OP_PREFETCH_DEPS "") diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc deleted file mode 100644 index 7f0ca1493f712f7f4809a56bf6a23f8757f94c2d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel { - public: - AmpCheckFiniteAndScaleOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", - "amp_check_finite_and_unscale"); - OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", - "amp_check_finite_and_unscale"); - PADDLE_ENFORCE_EQ( - ctx->Inputs("X").size(), ctx->Outputs("Out").size(), - platform::errors::InvalidArgument( - "The input(X) and output(Out) should have same size in " - "Operator(amp_check_finite_and_unscale), size of input(X) is %d " - "and size of output(Out) is %d.", - ctx->Inputs("X").size(), ctx->Outputs("Out").size())); - auto x_dims = ctx->GetInputsDim("X"); - ctx->SetOutputsDim("Out", x_dims); - ctx->SetOutputDim("FoundInfinite", {1}); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); - } -}; - -class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "X", - "(Tensors) The input tensors of amp_check_finite_and_scale operator.") - .AsDuplicable(); - AddInput("Scale", - "(Tensor) 1-dim tensor, the scale of amp_check_finite_and_scale " - "operator."); - AddOutput("Out", - "(Tensors) The scaled output tensor of " - "amp_check_finite_and_unscale operator.") - .AsDuplicable(); - AddOutput("FoundInfinite", - "(Tensor) 1-dim tensor, contains a bool scalar, which indicates " - "if there there is infinite or nan item in input X."); - AddComment(R"DOC( -amp_check_finite_and_scale operator. -Check if input X contains all finite data, if yes, scale it by input Scale. - -$$Out = X * scale$$ - -If any tensor in X contains Inf or Nan, the Out will generate a indicator. -FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of -Out should not be used, and its data may not be deterministic. -Otherwise, FoundInfinite will be 0 (False). 
- -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - amp_check_finite_and_scale, ops::AmpCheckFiniteAndScaleOp, - ops::AmpCheckFiniteAndScaleOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - amp_check_finite_and_scale, - ops::AmpCheckFiniteAndScaleKernel, - ops::AmpCheckFiniteAndScaleKernel); diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h deleted file mode 100644 index 6c2c4eb8a615c4c04a98601c25b5de43b4262e6b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/isfinite_op.h" - -namespace paddle { -namespace operators { - -template -class AmpCheckFiniteAndScaleKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); - - const T* scale_data = scale->data(); - bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); - - *found_inf_data = false; - framework::Tensor is_finite = - ctx.AllocateTmpTensor({1}, dev_ctx); - bool* is_finite_data = is_finite.template data(); - - auto& dev = *ctx.template device_context().eigen_device(); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - out->mutable_data(dev_ctx.GetPlace()); - if (!(*found_inf_data)) { - framework::TensorIsfinite(*x, &is_finite); - if (*is_finite_data) { - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*x); - eigen_out.device(dev) = (*scale_data) * eigen_in; - } else { - *found_inf_data = true; - break; - } - } - } - return; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..51c659d5db1c33d5e2db261b998a0673f5e766cb --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
+#include "paddle/fluid/framework/tensor_util.h"
+
+namespace paddle {
+namespace operators {
+
+class CheckFiniteAndUnscaleOp : public framework::OperatorWithKernel {
+ public:
+  CheckFiniteAndUnscaleOp(const std::string& type,
+                          const framework::VariableNameMap& inputs,
+                          const framework::VariableNameMap& outputs,
+                          const framework::AttributeMap& attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X",
+                   "check_finite_and_unscale");
+    OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
+                   "check_finite_and_unscale");
+    PADDLE_ENFORCE_EQ(
+        ctx->Inputs("X").size(), ctx->Outputs("Out").size(),
+        platform::errors::InvalidArgument(
+            "The input(X) and output(Out) should have same size in "
+            "Operator(check_finite_and_unscale), size of input(X) is %d "
+            "and size of output(Out) is %d.",
+            ctx->Inputs("X").size(), ctx->Outputs("Out").size()));
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->SetOutputDim("FoundInfinite", {1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "X",
+        "(Tensors) The input tensors of check_finite_and_unscale operator.")
+        .AsDuplicable();
+    AddInput("Scale",
+             "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
+             "operator.");
+    AddOutput("Out",
+              "(Tensors) The scaled output tensor of "
+              "check_finite_and_unscale operator.")
+        .AsDuplicable();
+    AddOutput("FoundInfinite",
+              "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
+              "if there is an infinite or nan item in input X.");
+    AddComment(R"DOC(
+check_finite_and_unscale operator.
+Check if input X contains all finite data, if yes, unscale it by input Scale.
+
+$$Out = X / scale$$
+
+If any tensor in X contains Inf or Nan, an indicator is generated:
+FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of
+Out should not be used, and its data may not be deterministic.
+Otherwise, FoundInfinite will be 0 (False).
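+
+For example, with scale = 8 and X = {[16, 32]}, Out = {[2, 4]} and
+FoundInfinite is 0 (False); if any entry of X were Inf or Nan, FoundInfinite
+would be 1 (True) and the values in Out should be ignored.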
+ +)DOC"); + } +}; + +template +class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = ctx.template device_context(); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + const T* scale_data = scale->data(); + bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); + + *found_inf_data = false; + framework::Tensor is_finite = + ctx.AllocateTmpTensor({1}, dev_ctx); + bool* is_finite_data = is_finite.template data(); + + auto& dev = *ctx.template device_context() + .eigen_device(); + + T inverse_scale = Inverse(*scale_data); + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(dev_ctx.GetPlace()); + if (!(*found_inf_data)) { + framework::TensorIsfinite(*x, &is_finite); + *found_inf_data = !(*is_finite_data); + } + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*x); + if (!(*found_inf_data)) { + eigen_out.device(dev) = eigen_in * inverse_scale; + } else { + eigen_out.device(dev) = eigen_in * static_cast(0); + } + } + return; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + check_finite_and_unscale, ops::CheckFiniteAndUnscaleOp, + ops::CheckFiniteAndUnscaleOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleCpuKernel, + ops::CheckFiniteAndUnscaleCpuKernel); diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu similarity index 63% rename from paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu rename to paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index ee00d7c5f4499867c2c706ddcf314c1bfae0a866..cf9df34a2467f8461c4c284b4848c54b76edf452 100644 --- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -14,28 +14,31 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" namespace paddle { namespace operators { template -__global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num, - bool* found_inf, T* out) { +__global__ void GpuInverse(const T* s, T* o) { + *o = Inverse(*s); +} + +template +__global__ void CheckFiniteAndUnscale(const T* in, const T* scale, int num, + bool* found_inf, T* out) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < num) { if (!isfinite(in[idx])) { - *found_inf = 1; + *found_inf = true; } - out[idx] = *found_inf ? in[idx] : in[idx] * scale[0]; + out[idx] = *found_inf ? 
in[idx] : in[idx] * (*scale); } } template -class AmpCheckFiniteAndScaleKernel - : public framework::OpKernel { +class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { auto& dev_ctx = ctx.template device_context(); @@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool)); + framework::Tensor inverse_scale = + ctx.AllocateTmpTensor({1}, dev_ctx); + T* inverse_scale_v = inverse_scale.template data(); + + GpuInverse<<<1, 1, 0, dev_ctx.stream()>>>(scale_data, inverse_scale_v); + for (size_t i = 0; i < xs.size(); ++i) { const auto* x = xs[i]; auto* out = outs[i]; @@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel T* out_data = out->mutable_data(dev_ctx.GetPlace()); int num = x->numel(); - int block = 512; + int block = 1024; int grid = (num + block - 1) / block; VLOG(3) << "launch kernel"; - AmpCheckFiniteAndScale<<>>( - x_data, scale_data, num, found_inf_data, out_data); + CheckFiniteAndUnscale<<>>( + x_data, inverse_scale_v, num, found_inf_data, out_data); VLOG(3) << "finish kernel"; } } @@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - amp_check_finite_and_scale, - ops::AmpCheckFiniteAndScaleKernel, - ops::AmpCheckFiniteAndScaleKernel); +REGISTER_OP_CUDA_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleGpuKernel, + ops::CheckFiniteAndUnscaleGpuKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4fb8744d0eee3c58f2948c5a466e08c2700b4332 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/operators/isfinite_op.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +template +inline HOSTDEVICE T Inverse(T s) { + return 1.0 / s; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fca3c531b40550952273f03f41bbc62cbff170fc --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -0,0 +1,170 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fca3c531b40550952273f03f41bbc62cbff170fc
--- /dev/null
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc
@@ -0,0 +1,170 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
+#include
+#include
+#include
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class UpdateLossScalingOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("FoundInfinite"), "Input", "FoundInfinite",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("PrevLossScaling"), "Input", "PrevLossScaling",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("InGoodSteps"), "Input", "InGoodSteps",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasInput("InBadSteps"), "Input", "InBadSteps",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutput("LossScaling"), "Output", "LossScaling",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutput("OutGoodSteps"), "Output", "OutGoodSteps",
+                   "update_loss_scaling");
+    OP_INOUT_CHECK(ctx->HasOutput("OutBadSteps"), "Output", "OutBadSteps",
+                   "update_loss_scaling");
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->SetOutputDim("LossScaling", {1});
+    ctx->SetOutputDim("OutGoodSteps", {1});
+    ctx->SetOutputDim("OutBadSteps", {1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "PrevLossScaling"),
+        ctx.device_context());
+  }
+};
+
+class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensors) The input tensors of update_loss_scaling operator.")
+        .AsDuplicable();
+    AddInput("FoundInfinite",
+             "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
+             "whether there is any infinite gradient.");
+    AddInput("PrevLossScaling",
+             "(Tensor) 1-dim tensor, previous loss scaling.");
+    AddInput("InGoodSteps",
+             "(Tensor) 1-dim tensor, accumulates good steps in which all "
+             "gradients are finite.");
+    AddInput("InBadSteps",
+             "(Tensor) 1-dim tensor, accumulates bad steps in which some "
+             "gradients are infinite.");
+    AddOutput("Out",
+              "(Tensors) The output tensors of update_loss_scaling operator.")
+        .AsDuplicable();
+    AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling.");
+    AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, updated good steps.");
+    AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps.");
+    AddAttr("incr_every_n_steps",
+            "A value that represents increasing the loss scaling every n "
+            "consecutive steps with finite gradients.");
+    AddAttr("decr_every_n_nan_or_inf",
+            "A value that represents decreasing the loss scaling every n "
+            "accumulated steps with nan or inf gradients.");
+    AddAttr("incr_ratio",
+            "The multiplier to use when increasing the loss scaling.")
+        .AddCustomChecker([](float incr_ratio) {
+          PADDLE_ENFORCE_EQ(incr_ratio > 1.0f, true,
+                            platform::errors::InvalidArgument(
+                                "'incr_ratio' should be greater than 1, but "
+                                "the received is %f",
+                                incr_ratio));
+        });
+    AddAttr(
+        "decr_ratio",
+        "The less-than-one multiplier to use when decreasing the loss scaling.")
+        .AddCustomChecker([](float decr_ratio) {
+          PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true,
+                            platform::errors::InvalidArgument(
+                                "'decr_ratio' should be between 0 and 1, but "
+                                "the received is %f",
+                                decr_ratio));
+        });
+    AddComment(R"DOC(
+Update the loss scaling according to the overall gradients. If all gradients
+are finite for incr_every_n_steps consecutive steps, the loss scaling
+increases by incr_ratio. Otherwise, once decr_every_n_nan_or_inf steps with
+nan or inf gradients have accumulated, the loss scaling decreases by
+decr_ratio.
+
+)DOC");
+  }
+};
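To make the policy described in the DOC block concrete, here is a small host-side walkthrough of the counter bookkeeping. The attribute values are hypothetical, chosen only for illustration; the real Update function shown later also clamps a decreased scaling at 1 and rejects a non-finite increased scaling:

    #include <cstdio>

    int main() {
      // Hypothetical settings: grow 2x after 1000 clean steps, halve after 2 bad steps.
      const int incr_every_n_steps = 1000, decr_every_n_nan_or_inf = 2;
      const float incr_ratio = 2.0f, decr_ratio = 0.5f;

      float loss_scaling = 32768.0f;
      int good = 0, bad = 0;

      // Simulate 1001 finite steps followed by 2 steps that overflow.
      for (int step = 0; step < 1003; ++step) {
        bool found_inf = (step >= 1001);
        if (found_inf) {
          good = 0;
          if (++bad == decr_every_n_nan_or_inf) {
            loss_scaling *= decr_ratio;  // 65536 -> 32768
            bad = 0;
          }
        } else {
          bad = 0;
          if (++good == incr_every_n_steps) {
            loss_scaling *= incr_ratio;  // 32768 -> 65536 at step 999
            good = 0;
          }
        }
      }
      printf("final loss scaling: %g\n", loss_scaling);  // prints 32768
      return 0;
    }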
+
+template
+class UpdateLossScalingFunctor {
+ public:
+  void operator()(const platform::CPUDeviceContext& ctx,
+                  const bool* found_inf_data, const T* pre_loss_scaling_data,
+                  const int* good_in_data, const int* bad_in_data,
+                  const int incr_every_n_steps,
+                  const int decr_every_n_nan_or_inf, const float incr_ratio,
+                  const float decr_ratio, T* updated_loss_scaling_data,
+                  int* good_out_data, int* bad_out_data) const {
+    Update(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
+           incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
+           decr_ratio, updated_loss_scaling_data, good_out_data,
+           bad_out_data);
+  }
+};
+
+template
+class LazyZeroInputs {
+ public:
+  void operator()(const platform::CPUDeviceContext& dev_ctx,
+                  const bool* found_inf_data,
+                  const std::vector& xs,
+                  const std::vector& outs) const {
+    if (*found_inf_data) {
+      VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
+      for (size_t i = 0; i < xs.size(); ++i) {
+        auto* out = outs[i];
+        T* out_data = out->mutable_data(dev_ctx.GetPlace());
+        int num = out->numel();
+        std::memset(out_data, 0, num * sizeof(T));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+
+REGISTER_OPERATOR(
+    update_loss_scaling, ops::UpdateLossScalingOp,
+    ops::UpdateLossScalingOpMaker,
+    paddle::framework::EmptyGradOpMaker,
+    paddle::framework::EmptyGradOpMaker);
+
+REGISTER_OP_CPU_KERNEL(update_loss_scaling,
+                       ops::UpdateLossScalingKernel,
+                       ops::UpdateLossScalingKernel);
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2bc60423d247447adf18eb3ef050ca9b395a2e2f
--- /dev/null
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu
@@ -0,0 +1,84 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +__global__ void GpuUpdateLossScaling( + const bool* found_inf_data, const T* pre_loss_scaling_data, + const int* good_in_data, const int* bad_in_data, + const int incr_every_n_steps, const int decr_every_n_nan_or_inf, + const float incr_ratio, const float decr_ratio, + T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) { + Update(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); +} + +template +class UpdateLossScalingFunctor { + public: + void operator()(const platform::CUDADeviceContext& dev_ctx, + const bool* found_inf_data, const T* pre_loss_scaling_data, + const int* good_in_data, const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, T* updated_loss_scaling_data, + int* good_out_data, int* bad_out_data) const { + GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( + found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); + } +}; + +template +class LazyZeroInputs { + public: + void operator()(const platform::CUDADeviceContext& dev_ctx, + const bool* found_inf_data, + const std::vector& xs, + const std::vector& outs) const { + const auto gpu_place = + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + bool has_inf{false}; + memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data, + sizeof(bool), dev_ctx.stream()); + if (has_inf) { + VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --"; + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + int num = out->numel(); + cudaMemset(out_data, 0, num * sizeof(T)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using GPU = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(update_loss_scaling, + ops::UpdateLossScalingKernel, + ops::UpdateLossScalingKernel); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ca23b72eff0e85ab94c4d1f11e986f69b4e2d776 --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h @@ -0,0 +1,123 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
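One detail of the CUDA LazyZeroInputs above: the found-infinity flag lives in device memory, so it is copied back to the host before the host code decides whether to zero the outputs. A standalone sketch of that readback pattern using the plain CUDA runtime API; the names are illustrative, and this version synchronizes the stream so the host value is valid before branching:

    #include <cuda_runtime.h>
    #include <cstdio>

    int main() {
      bool* d_found_inf = nullptr;
      cudaMalloc(&d_found_inf, sizeof(bool));
      cudaMemset(d_found_inf, 0, sizeof(bool));  // kernels would set this on overflow

      cudaStream_t stream;
      cudaStreamCreate(&stream);

      bool has_inf = false;
      // Copy the device-side flag back on the same stream the kernels used...
      cudaMemcpyAsync(&has_inf, d_found_inf, sizeof(bool),
                      cudaMemcpyDeviceToHost, stream);
      // ...and wait for it before the host branches on the value.
      cudaStreamSynchronize(stream);

      if (has_inf) {
        printf("infinite values found: zero the gradients and skip this step\n");
      }
      cudaFree(d_found_inf);
      cudaStreamDestroy(stream);
      return 0;
    }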
+
+#pragma once
+
+#include
+#include
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template
+HOSTDEVICE void Update(const bool* found_inf_data,
+                       const T* pre_loss_scaling_data, const int* good_in_data,
+                       const int* bad_in_data, const int incr_every_n_steps,
+                       const int decr_every_n_nan_or_inf,
+                       const float incr_ratio, const float decr_ratio,
+                       T* updated_loss_scaling_data, int* good_out_data,
+                       int* bad_out_data) {
+  if (*found_inf_data) {
+    *good_out_data = 0;
+    *bad_out_data = *bad_in_data + 1;
+    if (*bad_out_data == decr_every_n_nan_or_inf) {
+      T new_loss_scaling = *pre_loss_scaling_data * decr_ratio;
+      *updated_loss_scaling_data = new_loss_scaling < static_cast(1)
+                                       ? static_cast(1)
+                                       : new_loss_scaling;
+      *bad_out_data = 0;
+    }
+  } else {
+    *bad_out_data = 0;
+    *good_out_data = *good_in_data + 1;
+    if (*good_out_data == incr_every_n_steps) {
+      T new_loss_scaling = *pre_loss_scaling_data * incr_ratio;
+      *updated_loss_scaling_data = std::isfinite(new_loss_scaling)
+                                       ? new_loss_scaling
+                                       : *pre_loss_scaling_data;
+      *good_out_data = 0;
+    }
+  }
+}
+
+template
+class UpdateLossScalingFunctor {
+ public:
+  void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
+                  const T* pre_loss_scaling_data, const int* good_in_data,
+                  const int* bad_in_data, const int incr_every_n_steps,
+                  const int decr_every_n_nan_or_inf, const float incr_ratio,
+                  const float decr_ratio, T* updated_loss_scaling_data,
+                  int* good_out_data, int* bad_out_data) const;
+};
+
+template
+class LazyZeroInputs {
+ public:
+  void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
+                  const std::vector& xs,
+                  const std::vector& outs) const;
+};
+
+template
+class UpdateLossScalingKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto xs = ctx.MultiInput("X");
+    const auto* found_inf = ctx.Input("FoundInfinite");
+    const auto* pre_loss_scaling = ctx.Input("PrevLossScaling");
+    const auto* good_in = ctx.Input("InGoodSteps");
+    const auto* bad_in = ctx.Input("InBadSteps");
+    auto outs = ctx.MultiOutput("Out");
+    auto* updated_loss_scaling = ctx.Output("LossScaling");
+    auto* good_out = ctx.Output("OutGoodSteps");
+    auto* bad_out = ctx.Output("OutBadSteps");
+
+    PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
+                      platform::errors::InvalidArgument(
+                          "FoundInfinite must have only one element."));
+
+    const bool* found_inf_data = found_inf->data();
+    const T* pre_loss_scaling_data = pre_loss_scaling->data();
+    const int* good_in_data = good_in->data();
+    const int* bad_in_data = bad_in->data();
+
+    auto& dev_ctx = ctx.template device_context();
+    T* updated_loss_scaling_data =
+        updated_loss_scaling->mutable_data(dev_ctx.GetPlace());
+    int* good_out_data = good_out->mutable_data(dev_ctx.GetPlace());
+    int* bad_out_data = bad_out->mutable_data(dev_ctx.GetPlace());
+
+    const int incr_every_n_steps = ctx.Attr("incr_every_n_steps");
+    const int decr_every_n_nan_or_inf =
+        ctx.Attr("decr_every_n_nan_or_inf");
+    const float incr_ratio = ctx.Attr("incr_ratio");
+    const float decr_ratio = ctx.Attr("decr_ratio");
+    UpdateLossScalingFunctor{}(
+        dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
+        bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
+        decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data);
+    LazyZeroInputs{}(dev_ctx, found_inf_data, xs, outs);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
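The header above only declares UpdateLossScalingFunctor and LazyZeroInputs; each backend then supplies its own definition (the CPU ones earlier in this file's .cc, the CUDA ones in the .cu). A minimal generic sketch of that declare-in-header, specialize-per-device pattern, with invented names for illustration:

    #include <cstdio>

    struct CPUContext {};
    struct GPUContext {};

    // Primary template: declared only, so every backend must define its own.
    template <typename DeviceContext>
    struct ScaleFunctor;

    template <>
    struct ScaleFunctor<CPUContext> {
      void operator()(const CPUContext&, float s) const { printf("CPU scale %f\n", s); }
    };

    template <>
    struct ScaleFunctor<GPUContext> {
      void operator()(const GPUContext&, float s) const { printf("GPU scale %f\n", s); }
    };

    // Device-agnostic kernel code picks the right backend at compile time.
    template <typename DeviceContext>
    void Run(const DeviceContext& ctx) {
      ScaleFunctor<DeviceContext>{}(ctx, 2.0f);
    }

    int main() {
      Run(CPUContext{});  // prints "CPU scale 2.000000"
      return 0;
    }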
diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h
index c296ddcfbef703e8484b6ea0b7f96f037e415186..57e1c06f73c56334fc93dee7a16d6899f5a6f12a 100644
--- a/paddle/fluid/operators/arg_min_max_op_base.h
+++ b/paddle/fluid/operators/arg_min_max_op_base.h
@@ -110,10 +110,12 @@ struct VisitDataArgMinMaxFunctor {
         CALL_ARG_MINMAX_FUNCTOR(6);
         break;
       default:
-        PADDLE_THROW(
-            "%s operator doesn't supports tensors whose ranks are greater "
-            "than 6.",
-            (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax"));
+        PADDLE_ENFORCE_LE(
+            x_dims.size(), 6,
+            platform::errors::InvalidArgument(
+                "%s operator doesn't support tensors whose ranks are greater "
+                "than 6.",
+                (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")));
         break;
 #undef CALL_ARG_MINMAX_FUNCTOR
     }
@@ -164,7 +166,8 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_LT(
         axis, x_dims.size(),
         platform::errors::InvalidArgument(
-            "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size()));
+            "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis,
+            x_dims.size()));
 
     const int& dtype = ctx->Attrs().Get("dtype");
     PADDLE_ENFORCE_EQ(
@@ -192,10 +195,11 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
       }
       PADDLE_ENFORCE_LE(
           all_element_num, INT_MAX,
-          "The element num of the argmin/argmax input at axis is "
-          "%d, is larger than int32 maximum value:%d, you must "
-          "set the dtype of argmin/argmax to 'int64'.",
-          all_element_num, INT_MAX);
+          platform::errors::InvalidArgument(
+              "The element num of the argmin/argmax input at axis is "
+              "%d, which is larger than the int32 maximum value %d, so you "
+              "must set the dtype of argmin/argmax to 'int64'.",
+              all_element_num, INT_MAX));
     }
   }
   std::vector vec;
diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h
index 6ce04d19fc4376e4263712e2904e480e26590553..c2154f78bbe97418f2c7388a000dc833134d0c84 100644
--- a/paddle/fluid/operators/assign_op.h
+++ b/paddle/fluid/operators/assign_op.h
@@ -52,7 +52,10 @@ class AssignFunctor {
 
   template
   void operator()(const T &v) const {
-    PADDLE_THROW("Not support type for assign op %s", typeid(T).name());
+    PADDLE_ENFORCE_EQ(
+        true, false,
+        platform::errors::PermissionDenied(
+            "The assign op does not support the type %s.", typeid(T).name()));
   }
 
  private:
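A recurring pattern in these hunks, and across the PR, is replacing bare PADDLE_THROW format strings with a typed condition plus a platform::errors category. As a minimal, illustrative fragment of the target style (the check itself is hypothetical and would sit inside an InferShape body, so it is not a standalone program):

    // Hypothetical shape check in the new error style: a testable condition,
    // a typed error category, and a message reporting expected vs. received.
    PADDLE_ENFORCE_EQ(
        x_dims.size(), 4,
        platform::errors::InvalidArgument(
            "Input(X) must be a 4-D tensor, but the received rank is %d.",
            x_dims.size()));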
diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc
index 60f29ba39a8ee64f9fe5d95e685cac1fb52dfd21..4940649c2a32649a068c364081071ac840b4e25a 100644
--- a/paddle/fluid/operators/controlflow/compare_op.cc
+++ b/paddle/fluid/operators/controlflow/compare_op.cc
@@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel {
     framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
     // CompareOp kernel's device type is decided by input tensor place
     bool force_cpu = ctx.Attr("force_cpu");
-    kt.place_ = force_cpu ? platform::CPUPlace()
-                          : ctx.Input("X")->place();
+    if (force_cpu) {
+      kt.place_ = platform::CPUPlace();
+    } else {
+      if (ctx.Input("X")->place().type() !=
+          typeid(platform::CUDAPinnedPlace)) {
+        kt.place_ = ctx.Input("X")->place();
+      } else {
+        kt.place_ = ctx.GetPlace();
+      }
+    }
     return kt;
   }
 };
diff --git a/paddle/fluid/operators/correlation_op.cc b/paddle/fluid/operators/correlation_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2e6ff214bfa30af2e25b7feac41d22f02ab75a7
--- /dev/null
+++ b/paddle/fluid/operators/correlation_op.cc
@@ -0,0 +1,181 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+#include
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+inline std::vector CorrelationOutputSize(int batch, int input_height,
+                                         int input_width, int stride1,
+                                         int stride2, int kernel_size,
+                                         int pad_size,
+                                         int max_displacement) {
+  std::vector output_shape({batch});
+  int kernel_radius = (kernel_size - 1) / 2;
+  int border_radius = kernel_radius + max_displacement;
+  int padded_input_height = input_height + 2 * pad_size;
+  int padded_input_width = input_width + 2 * pad_size;
+  int output_channel = ((max_displacement / stride2) * 2 + 1) *
+                       ((max_displacement / stride2) * 2 + 1);
+  output_shape.push_back(output_channel);
+  int output_height =
+      std::ceil(static_cast(padded_input_height - 2 * border_radius) /
+                static_cast(stride1));
+  int output_width =
+      std::ceil(static_cast(padded_input_width - 2 * border_radius) /
+                static_cast(stride1));
+  output_shape.push_back(output_height);
+  output_shape.push_back(output_width);
+  return output_shape;
+}
+
+class CorrelationOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input1", "Input is a 4-D Tensor with shape [N, C, H, W]");
+    AddInput("Input2", "Input is a 4-D Tensor with shape [N, C, H, W]");
+    AddOutput("Output",
+              "(Tensor) The output tensor of correlation operator. "
+              "It has the same data format and data type as the Input.");
+    AddAttr("pad_size", "pad size for input1 and input2");
+    AddAttr("kernel_size", "kernel size of input1 and input2");
+    AddAttr("max_displacement", "max displacement of input1 and input2");
+    AddAttr("stride1", "Input1 stride");
+    AddAttr("stride2", "Input2 stride");
+    AddAttr("corr_type_multiply", "correlation coefficient").SetDefault(1);
+    AddComment(
+        R"DOC(Correlation of two feature maps. Only supports NCHW data format.)DOC");
+  }
+};
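A quick sanity check of the CorrelationOutputSize formula above with FlowNet-style hyperparameters; the concrete values are hypothetical, chosen only to exercise the arithmetic:

    #include <cmath>
    #include <cstdio>

    int main() {
      // Hypothetical inputs: N=2, C=3, H=W=64, pad_size=4, kernel_size=1,
      // max_displacement=4, stride1=stride2=1.
      int kernel_radius = (1 - 1) / 2;        // 0
      int border_radius = kernel_radius + 4;  // 4
      int padded = 64 + 2 * 4;                // 72
      int output_channel = ((4 / 1) * 2 + 1) * ((4 / 1) * 2 + 1);  // 9 * 9 = 81
      int output_hw = static_cast<int>(
          std::ceil(static_cast<float>(padded - 2 * border_radius) / 1.0f));  // 64
      printf("output shape: [2, %d, %d, %d]\n", output_channel, output_hw,
             output_hw);  // [2, 81, 64, 64]
      return 0;
    }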
+
+class CorrelationOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Input1"), "Input", "X", "CorrelationOp");
+    OP_INOUT_CHECK(ctx->HasInput("Input2"), "Input", "Y", "CorrelationOp");
+    int stride1 = ctx->Attrs().Get("stride1");
+    int stride2 = ctx->Attrs().Get("stride2");
+    int max_displacement = ctx->Attrs().Get("max_displacement");
+    int pad_size = ctx->Attrs().Get("pad_size");
+    int kernel_size = ctx->Attrs().Get("kernel_size");
+
+    auto in_dims = ctx->GetInputDim("Input1");
+    auto in2_dims = ctx->GetInputDim("Input2");
+
+    PADDLE_ENFORCE_EQ(in_dims.size() == 4, true,
+                      platform::errors::InvalidArgument(
+                          "Input(X) of CorrelationOp must be a 4-D Tensor. "
+                          "But the received rank is %d.",
+                          in_dims.size()));
+
+    PADDLE_ENFORCE_EQ(in2_dims.size() == 4, true,
+                      platform::errors::InvalidArgument(
+                          "Input(Y) of CorrelationOp must be a 4-D Tensor. "
+                          "But the received rank is %d.",
+                          in2_dims.size()));
+    std::vector output_shape =
+        CorrelationOutputSize(in_dims[0], in_dims[2], in_dims[3], stride1,
+                              stride2, kernel_size, pad_size, max_displacement);
+    ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type =
+        OperatorWithKernel::IndicateVarDataType(ctx, "Input1");
+    PADDLE_ENFORCE_EQ(input_data_type, ctx.Input("Input2")->type(),
+                      platform::errors::InvalidArgument(
+                          "X and Y should have the same data type"));
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+template
+class CorrelationOpGradMaker : public framework::SingleGradOpMaker {
+ public:
+  using framework::SingleGradOpMaker::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr op) const override {
+    op->SetType("correlation_grad");
+    op->SetInput("Input1", this->Input("Input1"));
+    op->SetInput("Input2", this->Input("Input2"));
+    op->SetInput(framework::GradVarName("Output"), this->OutputGrad("Output"));
+    op->SetOutput(framework::GradVarName("Input1"), this->InputGrad("Input1"));
+    op->SetOutput(framework::GradVarName("Input2"), this->InputGrad("Input2"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+class CorrelationOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Input1"), "Input", "X", "CorrelationOp");
+    OP_INOUT_CHECK(ctx->HasInput("Input2"), "Input", "Y", "CorrelationOp");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Output")), "Input",
+                   "Output@GRAD", "CorrelationGradOp");
+
+    auto in1_dims = ctx->GetInputDim("Input1");
+    auto in2_dims = ctx->GetInputDim("Input2");
+    ctx->SetOutputDim(framework::GradVarName("Input1"), in1_dims);
+    ctx->SetOutputDim(framework::GradVarName("Input2"), in2_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "Input1"), ctx.GetPlace());
+  }
+};
+
+template
+class CorrelationKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        platform::is_gpu_place(ctx.GetPlace()), true,
+        platform::errors::Unimplemented("Correlation only supports GPU now."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(correlation, ops::CorrelationOp, ops::CorrelationOpMaker,
+                  ops::CorrelationOpGradMaker,
+                  ops::CorrelationOpGradMaker);
+REGISTER_OPERATOR(correlation_grad, ops::CorrelationOpGrad);
+REGISTER_OP_CPU_KERNEL(correlation, ops::CorrelationKernel,
+                       ops::CorrelationKernel);
diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0d177f653ec3d5cacbd4d938e6e5da6689b1bc74
--- /dev/null
+++ b/paddle/fluid/operators/correlation_op.cu
@@ -0,0 +1,483 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+#define THREADS_PER_BLOCK 32
+#define FULL_MASK 0xffffffff
+
+using framework::Tensor;
+using DataLayout = framework::DataLayout;
+
+template
+__forceinline__ __device__ T warpReduceSum(T val) {
+  for (int offset = 16; offset > 0; offset /= 2) {
+    val += __shfl_down_sync(FULL_MASK, val, offset);
+  }
+  return val;
+}
+
+template
+__forceinline__ __device__ T blockReduceSum(T val) {
+  static __shared__ T shared[32];
+  int lane = threadIdx.x % warpSize;
+  int wid = threadIdx.x / warpSize;
+
+  val = warpReduceSum(val);
+  if (lane == 0) shared[wid] = val;
+
+  __syncthreads();
+  val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
+
+  if (wid == 0) val = warpReduceSum(val);
+
+  return val;
+}
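warpReduceSum above is the standard shuffle-based tree reduction: each of the five offsets halves the number of live partial sums, so after offsets 16, 8, 4, 2, 1 lane 0 holds the warp total. A self-contained CUDA sketch of the same primitive; the kernel and launch here are illustrative, not part of the patch:

    #include <cuda_runtime.h>
    #include <cstdio>

    #define FULL_MASK 0xffffffff

    __device__ float warpReduceSum(float val) {
      // After offsets 16, 8, 4, 2, 1, lane 0 holds the sum of all 32 lanes.
      for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down_sync(FULL_MASK, val, offset);
      return val;
    }

    __global__ void sum_one_warp(const float* in, float* out) {
      float v = warpReduceSum(in[threadIdx.x]);
      if (threadIdx.x == 0) *out = v;
    }

    int main() {
      float h_in[32], *d_in, *d_out, h_out = 0.0f;
      for (int i = 0; i < 32; ++i) h_in[i] = 1.0f;  // expected sum: 32
      cudaMalloc(&d_in, 32 * sizeof(float));
      cudaMalloc(&d_out, sizeof(float));
      cudaMemcpy(d_in, h_in, 32 * sizeof(float), cudaMemcpyHostToDevice);
      sum_one_warp<<<1, 32>>>(d_in, d_out);
      cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
      printf("warp sum = %g\n", h_out);  // prints 32
      cudaFree(d_in);
      cudaFree(d_out);
      return 0;
    }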
+
+template
+__global__ void set_zero(T *x, int num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x)
+    x[i] = static_cast(0);
+}
+
+template
+__global__ void channel_first(const T *input, T *rinput, const int channel,
+                              const int height, const int width,
+                              const int pad_size) {
+  int n = blockIdx.x;
+  int h = blockIdx.y;
+  int w = blockIdx.z;
+
+  int ch_off = threadIdx.x;
+  T value;
+  int dimchw = channel * height * width;
+  int dimhw = height * width;
+
+  int p_dimw = (width + 2 * pad_size);
+  int p_dimh = (height + 2 * pad_size);
+  int p_dimchw = channel * p_dimw * p_dimh;
+  int p_dimcw = channel * p_dimw;
+
+  for (int c = ch_off; c < channel; c += THREADS_PER_BLOCK) {
+    value = input[n * dimchw + c * dimhw + h * width + w];
+    rinput[n * p_dimchw + (h + pad_size) * p_dimcw + (w + pad_size) * channel +
+           c] = value;
+  }
+}
+
+template
+__global__ void correlation_forward(
+    T *output, const int output_channel, const int output_height,
+    const int output_width, const T *rinput1, const int input_channel,
+    const int input_height, const int input_width, const T *rinput2,
+    const int pad_size, const int kernel_size, const int max_displacement,
+    const int stride1, const int stride2) {
+  int p_input_width = input_width + 2 * pad_size;
+  int p_input_height = input_height + 2 * pad_size;
+
+  int kernel_rad = (kernel_size - 1) / 2;
+  int displacement_rad = max_displacement / stride2;
+
+  int displacement_size = 2 * displacement_rad + 1;
+
+  int n = blockIdx.x;
+  int h1 = blockIdx.y * stride1 + max_displacement;
+  int w1 = blockIdx.z * stride1 + max_displacement;
+  int c = threadIdx.x;
+
+  int p_dimchw = p_input_height * p_input_width * input_channel;
+  int p_dimcw = p_input_width * input_channel;
+  int p_dimc = input_channel;
+
+  int t_dimchw = output_channel * output_height * output_width;
+  int t_dimhw = output_height * output_width;
+  int t_dimw = output_width;
+
+  int nelems = kernel_size * kernel_size * p_dimc;
+
+  for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) {
+    for (int ti = -displacement_rad; ti <= displacement_rad; ++ti) {
+      int w2 = w1 + ti * stride2;
+      int h2 = h1 + tj * stride2;
+
+      T acc0 = 0;
+      for (int j = -kernel_rad; j <= kernel_rad; ++j) {
+        for (int i = -kernel_rad; i <= kernel_rad; ++i) {
+          for (int ch = c; ch < p_dimc; ch += blockDim.x) {
+            int index1 =
+                n * p_dimchw + (h1 + j) * p_dimcw + (w1 + i) * p_dimc + ch;
+            int index2 =
+                n * p_dimchw + (h2 + j) * p_dimcw + (w2 + i) * p_dimc + ch;
+            acc0 += static_cast(rinput1[index1] * rinput2[index2]);
+          }
+        }
+      }
+      if (blockDim.x == warpSize) {
+        __syncwarp();
+        acc0 = warpReduceSum(acc0);
+      } else {
+        __syncthreads();
+        acc0 = blockReduceSum(acc0);
+      }
+
+      if (threadIdx.x == 0) {
+        int tc = (tj + displacement_rad) * displacement_size +
+                 (ti + displacement_rad);
+        const int t_index =
+            n * t_dimchw + tc * t_dimhw + blockIdx.y * t_dimw + blockIdx.z;
+        output[t_index] = static_cast(acc0 / nelems);
+      }
+    }
+  }
+}
+
+template
+class CorrelationCUDAKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "Correlation only supports GPU now."));
+
+    auto *input1 = ctx.Input("Input1");
+    auto *input2 = ctx.Input("Input2");
+    int pad_size = ctx.Attr("pad_size");
+    int kernel_size =
ctx.Attr("kernel_size"); + int stride1 = ctx.Attr("stride1"); + int stride2 = ctx.Attr("stride2"); + int max_displacement = ctx.Attr("max_displacement"); + int corr_type_multiply = ctx.Attr("corr_type_multiply"); + + auto *output = ctx.Output("Output"); + output->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.template device_context(); + + // base on input1, NCHW + auto in_dims = input1->dims(); + int N = in_dims[0]; + int C = in_dims[1]; + int H = in_dims[2]; + int W = in_dims[3]; + + int padded_input_height = H + 2 * pad_size; + int padded_input_width = W + 2 * pad_size; + + Tensor rinput1 = ctx.AllocateTmpTensor( + {N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput1.mutable_data(ctx.GetPlace()); + + Tensor rinput2 = ctx.AllocateTmpTensor( + {N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput2.mutable_data(ctx.GetPlace()); + + set_zero<<<(rinput1.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>( + rinput1.data(), rinput1.numel()); + set_zero<<<(rinput2.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>( + rinput2.data(), rinput2.numel()); + set_zero<<<(output->numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>( + output->data(), output->numel()); + + auto out_dims = output->dims(); + int OC = out_dims[1]; + int OH = out_dims[2]; + int OW = out_dims[3]; + + dim3 blocks_grid(N, H, W); + dim3 threads_block(THREADS_PER_BLOCK); + + channel_first<<>>( + input1->data(), rinput1.data(), C, H, W, pad_size); + channel_first<<>>( + input2->data(), rinput2.data(), C, H, W, pad_size); + + dim3 threadsPerBlock(THREADS_PER_BLOCK); + dim3 totalBlocksCorr(N, OH, OW); + + correlation_forward< + T><<>>( + output->data(), OC, OH, OW, rinput1.data(), C, H, W, + rinput2.data(), pad_size, kernel_size, max_displacement, stride1, + stride2); + } +}; + +template +__global__ void correlation_backward_input1( + int item, T *grad_input1, const int input_channel, const int input_height, + const int input_width, const T *grad_output, const int output_channel, + const int output_height, const int output_width, const T *rinput2, + const int pad_size, const int kernel_size, const int max_displacement, + const int stride1, const int stride2) { + int n = item; + int h = blockIdx.x * stride1 + pad_size; + int w = blockIdx.y * stride1 + pad_size; + int c = blockIdx.z; + int tch_off = threadIdx.x; + + int kernel_rad = (kernel_size - 1) / 2; + int displacement_rad = max_displacement / stride2; + int displacement_size = 2 * displacement_rad + 1; + + int xmin = (w - kernel_rad - max_displacement) / stride1; + int ymin = (h - kernel_rad - max_displacement) / stride1; + + int xmax = (w + kernel_rad - max_displacement) / stride1; + int ymax = (h + kernel_rad - max_displacement) / stride1; + + if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) { + return; + } + + if (xmin > xmax || ymin > ymax) { + return; + } + + xmin = max(0, xmin); + xmax = min(output_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(output_height - 1, ymax); + + int p_input_width = input_width + 2 * pad_size; + int p_input_height = input_height + 2 * pad_size; + int p_dimchw = input_channel * p_input_height * p_input_width; + int p_dimcw = input_channel * p_input_width; + int p_dimc = input_channel; + + int t_dimchw = output_channel * output_height * output_width; + int t_dimhw = output_height * output_width; + int t_dimw = output_width; + + int o_dimchw = input_channel * input_height * input_width; + int o_dimhw = input_height * input_width; + int o_dimw = input_width; + + int nelems 
= kernel_size * kernel_size * input_channel; + + __shared__ T prod_sum[THREADS_PER_BLOCK]; + prod_sum[tch_off] = 0; + + for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) { + int i2 = (tc % displacement_size - displacement_rad) * stride2; + int j2 = (tc / displacement_size - displacement_rad) * stride2; + + int index2 = n * p_dimchw + (h + j2) * p_dimcw + (w + i2) * p_dimc + c; + + T val2 = rinput2[index2]; + for (int j = ymin; j <= ymax; ++j) { + for (int i = xmin; i <= xmax; ++i) { + int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i; + prod_sum[tch_off] += grad_output[t_index] * val2; + } + } + } + + __syncthreads(); + + if (tch_off == 0) { + T reduce_sum = 0; + for (int index = 0; index < THREADS_PER_BLOCK; index++) { + reduce_sum += prod_sum[index]; + } + const int index1 = + n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size); + grad_input1[index1] = static_cast(reduce_sum / nelems); + } +} + +template +__global__ void correlation_backward_input2( + int item, T *grad_input2, const int input_channel, const int input_height, + const int input_width, const T *grad_output, const int output_channel, + const int output_height, const int output_width, const T *rinput1, + const int pad_size, const int kernel_size, const int max_displacement, + const int stride1, const int stride2) { + int n = item; + int h = blockIdx.x * stride1 + pad_size; + int w = blockIdx.y * stride1 + pad_size; + int c = blockIdx.z; + + int tch_off = threadIdx.x; + + int kernel_rad = (kernel_size - 1) / 2; + int displacement_rad = max_displacement / stride2; + int displacement_size = 2 * displacement_rad + 1; + + int p_input_width = input_width + 2 * pad_size; + int p_input_height = input_height + 2 * pad_size; + int p_dimchw = input_channel * p_input_height * p_input_width; + int p_dimcw = input_channel * p_input_width; + int p_dimc = input_channel; + + int t_dimchw = output_channel * output_height * output_width; + int t_dimhw = output_height * output_width; + int t_dimw = output_width; + + int o_dimchw = input_channel * input_height * input_width; + int o_dimhw = input_height * input_width; + int o_dimw = input_width; + + int nelems = kernel_size * kernel_size * input_channel; + + __shared__ T prod_sum[THREADS_PER_BLOCK]; + prod_sum[tch_off] = 0; + + for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) { + int i2 = (tc % displacement_size - displacement_rad) * stride2; + int j2 = (tc / displacement_size - displacement_rad) * stride2; + + int xmin = (w - kernel_rad - max_displacement - i2) / stride1; + int ymin = (h - kernel_rad - max_displacement - j2) / stride1; + + int xmax = (w + kernel_rad - max_displacement - i2) / stride1; + int ymax = (h + kernel_rad - max_displacement - j2) / stride1; + + if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) { + continue; + } + + if (xmin > xmax || ymin > ymax) { + continue; + } + + xmin = max(0, xmin); + xmax = min(output_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(output_height - 1, ymax); + + int index1 = n * p_dimchw + (h - j2) * p_dimcw + (w - i2) * p_dimc + c; + T val1 = rinput1[index1]; + for (int j = ymin; j <= ymax; ++j) { + for (int i = xmin; i <= xmax; ++i) { + int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i; + prod_sum[tch_off] += grad_output[t_index] * val1; + } + } + } + + __syncthreads(); + + if (tch_off == 0) { + T reduce_sum = 0; + for (int index = 0; index < THREADS_PER_BLOCK; index++) { + reduce_sum += prod_sum[index]; + } + const int index2 = + n * 
o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size); + grad_input2[index2] = static_cast(reduce_sum / nelems); + } +} + +template +class CorrelationCUDAGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::InvalidArgument( + "Correlation only supports GPU now.")); + const auto *input1 = ctx.Input("Input1"); + const auto *input2 = ctx.Input("Input2"); + const auto *grad_output = + ctx.Input(framework::GradVarName("Output")); + const int pad_size = ctx.Attr("pad_size"); + const int kernel_size = ctx.Attr("kernel_size"); + const int stride1 = ctx.Attr("stride1"); + const int stride2 = ctx.Attr("stride2"); + const int max_displacement = ctx.Attr("max_displacement"); + const int corr_type_multiply = ctx.Attr("corr_type_multiply"); + + auto *grad_input1 = ctx.Output(framework::GradVarName("Input1")); + grad_input1->mutable_data(ctx.GetPlace()); + auto *grad_input2 = ctx.Output(framework::GradVarName("Input2")); + grad_input2->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.template device_context(); + + auto in_dims = input1->dims(); + int N = in_dims[0]; + int C = in_dims[1]; + int H = in_dims[2]; + int W = in_dims[3]; + + int padded_input_height = H + 2 * pad_size; + int padded_input_width = W + 2 * pad_size; + + Tensor rinput1 = ctx.AllocateTmpTensor( + {N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput1.mutable_data(ctx.GetPlace()); + + Tensor rinput2 = ctx.AllocateTmpTensor( + {N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput2.mutable_data(ctx.GetPlace()); + + set_zero<<<(rinput1.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>( + rinput1.data(), rinput1.numel()); + set_zero<<<(rinput2.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>( + rinput2.data(), rinput2.numel()); + set_zero<<<(grad_input1->numel() + 512 - 1) / 512, 512, 0, + dev_ctx.stream()>>>(grad_input1->data(), + grad_input1->numel()); + set_zero<<<(grad_input2->numel() + 512 - 1) / 512, 512, 0, + dev_ctx.stream()>>>(grad_input2->data(), + grad_input2->numel()); + + auto grad_out_dims = grad_output->dims(); + int GOC = grad_out_dims[1]; + int GOH = grad_out_dims[2]; + int GOW = grad_out_dims[3]; + + dim3 blocks_grid(N, H, W); + dim3 threads_block(THREADS_PER_BLOCK); + + channel_first<<>>( + input1->data(), rinput1.data(), C, H, W, pad_size); + channel_first<<>>( + input2->data(), rinput2.data(), C, H, W, pad_size); + + dim3 threadsPerBlock(THREADS_PER_BLOCK); + dim3 totalBlocksCorr(H, W, C); + + for (int n = 0; n < N; n++) { + correlation_backward_input1< + T><<>>( + n, grad_input1->data(), C, H, W, grad_output->data(), GOC, GOH, + GOW, rinput2.data(), pad_size, kernel_size, max_displacement, + stride1, stride2); + } + + for (int n = 0; n < N; n++) { + correlation_backward_input2< + T><<>>( + n, grad_input2->data(), C, H, W, grad_output->data(), GOC, GOH, + GOW, rinput1.data(), pad_size, kernel_size, max_displacement, + stride1, stride2); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(correlation, ops::CorrelationCUDAKernel, + ops::CorrelationCUDAKernel); +REGISTER_OP_CUDA_KERNEL(correlation_grad, ops::CorrelationCUDAGradKernel, + ops::CorrelationCUDAGradKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h new file mode 100644 index 
0000000000000000000000000000000000000000..4b46e2b475e8bd59c59744bdfde7bfb1248bc99a --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -0,0 +1,166 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/dynload/cudnn.h" + +namespace paddle { +namespace operators { + +class ScopedRNNBase { + public: + ScopedRNNBase(int seq_length, int batch_size, int input_size, int hidden_size, + int num_layers, float dropout_prob, int seed, int weight_numel, + bool initialized, bool is_bidirec) + : seq_length_(seq_length), + batch_size_(batch_size), + input_size_(input_size), + hidden_size_(hidden_size), + num_layers_(num_layers), + dropout_prob_(dropout_prob), + seed_(seed), + weight_numel_(weight_numel), + initialized_(initialized), + is_bidirec_(is_bidirec) {} + + template + void Create(const cudnnHandle_t& handle, const platform::Place& place, + const std::vector& sequence_length, size_t* workspace_size, + size_t* reserve_size, framework::Tensor* dropout_state) { + int numDirections = is_bidirec_ ? 2 : 1; + cudnnDataType_t cudnn_type = platform::CudnnDataType::type; + + // ------------------- cudnn x, y descriptors --------------------- + std::vector dims_x = {batch_size_, input_size_, 1}; + std::vector strides_x = {input_size_, 1, 1}; + std::vector dims_y = {batch_size_, hidden_size_ * numDirections, 1}; + std::vector strides_y = {hidden_size_ * numDirections, 1, 1}; + for (int i = 0; i < seq_length_; ++i) { + x_descs_.emplace_back(x_desc_.descriptor(dims_x, strides_x)); + y_descs_.emplace_back(y_desc_.descriptor(dims_y, strides_y)); + } + if (!sequence_length.empty()) { + x_seq_desc_.descriptor(seq_length_, batch_size_, input_size_, true, + sequence_length); + y_seq_desc_.descriptor(seq_length_, batch_size_, + hidden_size_ * numDirections, true, + sequence_length); + } + + // ------------------- cudnn hx, hy, cx, cy descriptors---------- + std::vector dims_hx = {num_layers_ * numDirections, batch_size_, + hidden_size_}; + std::vector strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1}; + init_h_desc_.descriptor(dims_hx, strides_hx); + init_c_desc_.descriptor(dims_hx, strides_hx); + last_h_desc_.descriptor(dims_hx, strides_hx); + last_c_desc_.descriptor(dims_hx, strides_hx); + + // ------------------- cudnn dropout descriptors --------------------- + size_t state_size; + if (!initialized_) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); + dropout_state->mutable_data({static_cast(state_size)}, + place); + } + dropout_desc_.descriptor(handle, place, initialized_, dropout_prob_, + dropout_state, seed_, state_size); + +// ------------------- cudnn rnn descriptors --------------------- +#if CUDNN_VERSION >= 6000 + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_.desc(), hidden_size_, num_layers_, + 
dropout_desc_.desc(), CUDNN_LINEAR_INPUT,
+        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
+        CUDNN_RNN_ALGO_STANDARD, cudnn_type));
+#else
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor(
+        rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(),
+        CUDNN_LINEAR_INPUT,
+        is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
+        cudnn_type));
+#endif
+    if (!sequence_length.empty()) {
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode(
+          rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED));
+    }
+
+    // ------------------- cudnn weights_size ---------------------
+    size_t weights_size_;
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize(
+        handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type));
+    PADDLE_ENFORCE_EQ(
+        weights_size_, sizeof(T) * weight_numel_,
+        platform::errors::InvalidArgument(
+            "The weight size required by cudnn lstm and the weight size "
+            "provided should be equal."));
+    // ------------------- cudnn weight descriptors ---------------------
+    platform::DataLayout layout = platform::DataLayout::kNCHW;
+    int dim_tmp = weights_size_ / sizeof(T);
+    std::vector dim_w = {dim_tmp, 1, 1};
+    weight_desc_.descriptor(layout, dim_w);
+    // ------------------- cudnn workspace, reserve size ---------------------
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize(
+        handle, rnn_desc_.desc(), seq_length_, x_descs_.data(),
+        workspace_size));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::cudnnGetRNNTrainingReserveSize(
+            handle, rnn_desc_.desc(), seq_length_, x_descs_.data(),
+            reserve_size));
+  }
+  cudnnTensorDescriptor_t* x_descs() { return x_descs_.data(); }
+  cudnnTensorDescriptor_t* y_descs() { return y_descs_.data(); }
+  cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_.desc(); }
+  cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_.desc(); }
+  cudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); }
+  cudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); }
+  cudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); }
+  cudnnTensorDescriptor_t last_c_desc() { return last_c_desc_.desc(); }
+  cudnnRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); }
+  cudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); }
+  cudnnFilterDescriptor_t weight_desc() { return weight_desc_.desc(); }
+
+ private:
+  int seq_length_;
+  int batch_size_;
+  int input_size_;
+  int hidden_size_;
+  int num_layers_;
+  float dropout_prob_;
+  int seed_;
+  int weight_numel_;
+  bool initialized_;
+  bool is_bidirec_;
+  std::vector x_descs_;
+  std::vector y_descs_;
+
+  platform::ScopedTensorDescriptor x_desc_;
+  platform::ScopedTensorDescriptor y_desc_;
+  platform::ScopedRNNTensorDescriptor x_seq_desc_;
+  platform::ScopedRNNTensorDescriptor y_seq_desc_;
+  platform::ScopedTensorDescriptor init_h_desc_;
+  platform::ScopedTensorDescriptor init_c_desc_;
+  platform::ScopedTensorDescriptor last_h_desc_;
+  platform::ScopedTensorDescriptor last_c_desc_;
+  platform::ScopedDropoutDescriptor dropout_desc_;
+  platform::ScopedFilterDescriptor weight_desc_;
+  platform::ScopedRNNDescriptor rnn_desc_;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc
index cc807f193ed835cfbf04dfcefad7ffb24e8ab286..82954bc109a740c0fe31ab889eb07bbbe3f52417 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cc
@@ -51,6 +51,16 @@ class 
CudnnLSTMOp : public framework::OperatorWithKernel { "received InitH's rank is %d.", init_h_dims.size())); + if (ctx->HasInput("SequenceLength")) { + auto seq_dims = ctx->GetInputDim("SequenceLength"); + PADDLE_ENFORCE_EQ( + in_dims[1], seq_dims[0], + platform::errors::InvalidArgument( + "The size of SequenceLength has to equal the batch_size. But " + "received batch_size is %d and the size of SequenceLength is %d.", + in_dims[1], seq_dims[0])); + } + PADDLE_ENFORCE_EQ( in_dims[1], init_h_dims[1], platform::errors::InvalidArgument( @@ -113,6 +123,12 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) the learnable hidden-hidden weights." " The shape is (N), where N is total weight size of the LSTM. " " cudnn concatenate all the weight to one Tensor"); + AddInput("SequenceLength", + "(Tensor) When the input data is padding, " + "set this parameter. This parameter represents " + "the variable sequence lengths in a batch. " + "The size of the vector has to equal the batch_size.") + .AsDispensable(); AddOutput("Reserve", "(Tensor, a temporary output Tensor to store the reserve_data " "of cudnn kernel.") @@ -155,13 +171,6 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(1); AddAttr("is_test", "True if in test phase.").SetDefault(false); AddAttr("seed", "seed to used if fix_seed is True").SetDefault(0); - AddAttr>("sequence_length", - "(vector) When the input data is padding, " - "set this parameter. This parameter represents " - "the variable sequence" - "lengths in a batch. The size of the vector has " - "to equal the batch_size.") - .SetDefault({}); AddComment(R"DOC( CUDNN LSTM implementation @@ -243,6 +252,9 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("InitH", this->Input("InitH")); op->SetInput("InitC", this->Input("InitC")); op->SetInput("W", this->Input("W")); + if (this->HasInput("SequenceLength")) { + op->SetInput("SequenceLength", this->Input("SequenceLength")); + } op->SetInput("Reserve", this->Output("Reserve")); op->SetInput("StateOut", this->Output("StateOut")); op->SetInput("Out", this->Output("Out")); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index f60cd41d9a218c444254d268eb43abfb97db43e6..6457d9295dcbfa99d18f63fbda3dae048d7713cd 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -13,8 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#include "paddle/fluid/operators/cudnn_lstm_cache.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/cudnn_desc.h" #include "paddle/fluid/platform/cudnn_helper.h" @@ -24,6 +25,43 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; +template +void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, + const int &seq_length, ScopedRNNBase *rnn, const T *x_data, + const T *init_h_data, const T *init_c_data, const T *w_data, + T *out_data, T *last_h_data, T *last_c_data, + framework::Tensor *workspace_data, + const size_t &workspace_size) { + if (!has_seq_length) { + // for inference + // This interface is used when the input/output is unpadded. 
+ PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( + handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, + rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, + rnn->weight_desc(), w_data, rnn->y_descs(), out_data, + rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data, + workspace_data->data(), workspace_size)); + } else { +#if CUDNN_VERSION >= 7201 + // for inference + // This interface is used when the input/output is padded. + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( + handle, rnn->rnn_desc(), rnn->x_seq_desc(), x_data, rnn->init_h_desc(), + init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), + w_data, rnn->y_seq_desc(), out_data, rnn->last_h_desc(), last_h_data, + rnn->last_c_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, workspace_data->data(), + workspace_size)); +#else + // CUDNN VERSION has to >=7.2.1 + PADDLE_THROW(platform::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardInferenceEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } +} + template class CudnnLSTMGPUKernel : public framework::OpKernel { public: @@ -56,7 +94,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { int num_layers = ctx.Attr("num_layers"); bool is_test = ctx.Attr("is_test"); int seed = ctx.Attr("seed"); - auto sequence_length = ctx.Attr>("sequence_length"); + + bool has_seq_length = ctx.HasInput("SequenceLength"); + std::vector SequenceLength; + if (has_seq_length) { + auto *sequence_length = ctx.Input("SequenceLength"); + SequenceLength = operators::GetDataFromTensor(sequence_length); + } auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); @@ -70,58 +114,32 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { size_t workspace_size; size_t reserve_size; - platform::ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size, - num_layers, dropout_prob, seed, weight_numel, - state_initialized, is_bidirec); - rnn.Create(handle, ctx.GetPlace(), sequence_length, &workspace_size, + ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size, + num_layers, dropout_prob, seed, weight_numel, + state_initialized, is_bidirec); + rnn.Create(handle, ctx.GetPlace(), SequenceLength, &workspace_size, &reserve_size, state_out); framework::Tensor workspace_data_; - workspace_data_.Resize({static_cast(workspace_size)}); - workspace_data_.mutable_data(ctx.GetPlace()); + workspace_data_.mutable_data( + {static_cast(workspace_size)}, ctx.GetPlace()); auto *reserve_data = reserve->mutable_data( {static_cast(reserve_size)}, ctx.GetPlace()); if (is_test) { - if (sequence_length.empty()) { - // for inference - // This interface is used when the input/output is unpadded. - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( - handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), x_data, - rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data, - rnn.w_desc(), w_data, rnn.y_desc(), out_data, rnn.hy_desc(), - last_h_data, rnn.cy_desc(), last_c_data, - workspace_data_.data(), workspace_size)); - } else { -#if CUDNN_VERSION >= 7201 - // for inference - // This interface is used when the input/output is padded. 
- PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnRNNForwardInferenceEx( - handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.hx_desc(), - init_h_data, rnn.cx_desc(), init_c_data, rnn.w_desc(), w_data, - rnn.y_seq_desc(), out_data, rnn.hy_desc(), last_h_data, - rnn.cy_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, - workspace_data_.data(), workspace_size)); -#else - PADDLE_ENFORCE_NOT_NULL( - nullptr, platform::errors::Unavailable( - "The padded input is supported by " - "cudnnRNNForwardInferenceEx, but it only works when " - "the version of cudnn is larger than 7.2.1")); -#endif - } + LSTMInferece(has_seq_length, handle, seq_length, &rnn, x_data, + init_h_data, init_c_data, w_data, out_data, last_h_data, + last_c_data, &workspace_data_, workspace_size); } else { - if (sequence_length.empty()) { + if (!has_seq_length) { // for train // This interface is used when the input/output is unpadded. PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( - handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), x_data, - rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data, - rnn.w_desc(), w_data, rnn.y_desc(), out_data, rnn.hy_desc(), - last_h_data, rnn.cy_desc(), last_c_data, + handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.weight_desc(), w_data, rnn.y_descs(), out_data, + rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, workspace_data_.data(), workspace_size, reserve_data, reserve_size)); } else { @@ -130,19 +148,18 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { // This interface is used when the input/output is padded. PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnRNNForwardTrainingEx( - handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, rnn.hx_desc(), - init_h_data, rnn.cx_desc(), init_c_data, rnn.w_desc(), w_data, - rnn.y_seq_desc(), out_data, rnn.hy_desc(), last_h_data, - rnn.cy_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, - workspace_data_.data(), workspace_size, reserve_data, - reserve_size)); + handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.weight_desc(), w_data, rnn.y_seq_desc(), out_data, + rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, workspace_data_.data(), workspace_size, + reserve_data, reserve_size)); #else - PADDLE_ENFORCE_NOT_NULL( - nullptr, platform::errors::Unavailable( - "The padded input is supported by " - "cudnnRNNForwardTrainingEx, but it only works when " - "the version of cudnn is larger than 7.2.1")); + PADDLE_THROW(platform::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardTrainingEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); #endif } } @@ -203,7 +220,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { int hidden_size = ctx.Attr("hidden_size"); int num_layers = ctx.Attr("num_layers"); int seed = ctx.Attr("seed"); - auto sequence_length = ctx.Attr>("sequence_length"); + + bool has_seq_length = ctx.HasInput("SequenceLength"); + std::vector SequenceLength; + if (has_seq_length) { + auto *sequence_length = ctx.Input("SequenceLength"); + SequenceLength = operators::GetDataFromTensor(sequence_length); + } int seq_length = input_dims[0]; int batch_size = input->dims()[1]; @@ -213,33 +236,33 @@ class 
CudnnLSTMGPUGradKernel : public framework::OpKernel { size_t workspace_size; size_t reserve_size; - platform::ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size, - num_layers, dropout_prob, seed, weight_numel, - true, is_bidirec); + ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size, + num_layers, dropout_prob, seed, weight_numel, true, + is_bidirec); - rnn.Create(handle, ctx.GetPlace(), sequence_length, &workspace_size, + rnn.Create(handle, ctx.GetPlace(), SequenceLength, &workspace_size, &reserve_size, const_cast(state_out)); framework::Tensor workspace_data_; - workspace_data_.Resize({static_cast(workspace_size)}); - workspace_data_.mutable_data(ctx.GetPlace()); + workspace_data_.mutable_data( + {static_cast(workspace_size)}, ctx.GetPlace()); const uint8_t *reserve_data = reserve->data(); - if (sequence_length.empty()) { + if (!has_seq_length) { // This interface is used when the input/output is unpadded. PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( - handle, rnn.rnn_desc(), seq_length, rnn.y_desc(), out_data, - rnn.y_desc(), out_grad_data, rnn.hy_desc(), last_h_grad_data, - rnn.cy_desc(), last_c_grad_data, rnn.w_desc(), weight_data, - rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data, rnn.x_desc(), - in_grad_data, rnn.hx_desc(), init_h_grad_data, rnn.cx_desc(), - init_c_grad_data, workspace_data_.data(), workspace_size, - const_cast(reserve_data), reserve_size)); + handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, + rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, + rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.x_descs(), in_grad_data, rnn.init_h_desc(), init_h_grad_data, + rnn.init_c_desc(), init_c_grad_data, workspace_data_.data(), + workspace_size, const_cast(reserve_data), reserve_size)); PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( - handle, rnn.rnn_desc(), seq_length, rnn.x_desc(), input->data(), - rnn.hx_desc(), init_h->data(), rnn.y_desc(), out->data(), - workspace_data_.data(), workspace_size, rnn.w_desc(), + handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), + rnn.init_h_desc(), init_h->data(), rnn.y_descs(), out->data(), + workspace_data_.data(), workspace_size, rnn.weight_desc(), weight_grad->data(), const_cast(reserve_data), reserve_size)); } else { @@ -248,27 +271,25 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { // This interface is used when the input/output is padded. 
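// Note (illustrative sketch, not part of this patch; argument lists
// abbreviated): in both the padded and unpadded branches below, cuDNN
// requires cudnnRNNBackwardData to be issued before
// cudnnRNNBackwardWeights, and both calls consume the reserve space written
// by the forward training pass. Schematically:
//
//   cudnnRNNBackwardData(handle, rnn_desc, /* y, dy, dhy, dcy, w, ... */
//                        workspace, workspace_size,
//                        reserve, reserve_size);   // must come first
//   cudnnRNNBackwardWeights(handle, rnn_desc, /* x, hx, y, ... dw */
//                           workspace, workspace_size,
//                           reserve, reserve_size);  // dw is accumulated into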
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(), - out_grad_data, nullptr, nullptr, rnn.hy_desc(), last_h_grad_data, - rnn.cy_desc(), last_c_grad_data, rnn.w_desc(), weight_data, - rnn.hx_desc(), init_h_data, rnn.cx_desc(), init_c_data, - rnn.x_seq_desc(), in_grad_data, rnn.hx_desc(), init_h_grad_data, - rnn.cx_desc(), init_c_grad_data, nullptr, nullptr, + out_grad_data, nullptr, nullptr, rnn.last_h_desc(), last_h_grad_data, + rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.x_seq_desc(), in_grad_data, rnn.init_h_desc(), init_h_grad_data, + rnn.init_c_desc(), init_c_grad_data, nullptr, nullptr, workspace_data_.data<uint8_t>(), workspace_size, const_cast<uint8_t *>(reserve_data), reserve_size)); PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data<T>(), - rnn.hx_desc(), init_h->data<T>(), rnn.y_seq_desc(), out->data<T>(), - workspace_data_.data<uint8_t>(), workspace_size, rnn.w_desc(), - weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data), - reserve_size)); + rnn.init_h_desc(), init_h->data<T>(), rnn.y_seq_desc(), + out->data<T>(), workspace_data_.data<uint8_t>(), workspace_size, + rnn.weight_desc(), weight_grad->data<T>(), + const_cast<uint8_t *>(reserve_data), reserve_size)); #else - PADDLE_ENFORCE_NOT_NULL( - nullptr, - platform::errors::Unavailable( - "The padded input of rnn is supported by cudnnRNNBackwardDataEx, " - "cudnnRNNBackwardWeightsEx, but it only works when the version " - "of cudnn is larger than 7.2.1")); + PADDLE_THROW(platform::errors::Unavailable( + "The padded input of rnn is supported by cudnnRNNBackwardDataEx " + "and cudnnRNNBackwardWeightsEx, but they only work when the version " + "of cudnn is 7.2.1 or higher.")); #endif } } diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 537063640e4ef6e49f7b991482f0f3122ecef02f..c2b7c27ab4adb5282ad7aa5f7a16c15f81ba5f5e 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) if(WITH_GPU) - detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) - detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub) - detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub) + set(TMPDEPS memory) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + set(TMPDEPS memory cub) + endif() + detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS}) + detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS}) + detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS}) else() detection_library(generate_proposals_op SRCS generate_proposals_op.cc) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc index
b3e3332fe3425301a3dede3a3d810697ad4debf3..44f602237da2e2c8fa26e39326f977d10235155d 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License.*/ #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -54,11 +55,14 @@ class CollectFpnProposalsOp : public framework::OperatorWithKernel { score_dim[1])); } context->SetOutputDim("FpnRois", {post_nms_topN, 4}); + if (context->HasOutput("RoisNum")) { + context->SetOutputDim("RoisNum", {-1}); + } if (!context->IsRuntime()) { // Runtime LoD infershape will be computed // in Kernel. context->ShareLoD("MultiLevelRois", "FpnRois"); } - if (context->IsRuntime()) { + if (context->IsRuntime() && !context->HasInputs("MultiLevelRoIsNum")) { std::vector<framework::InferShapeVarPtr> roi_inputs = context->GetInputVarPtrs("MultiLevelRois"); std::vector<framework::InferShapeVarPtr> score_inputs = @@ -99,7 +103,16 @@ class CollectFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor) Multiple score LoDTensors from each level in shape" " (N, 1), N is the number of RoIs.") .AsDuplicable(); + AddInput( + "MultiLevelRoIsNum", + "(List of Tensor) The RoIs' number of each image on multiple levels. " + "The number on each level has the shape of (N), N is the number of " + "images.") + .AsDuplicable() + .AsDispensable(); AddOutput("FpnRois", "(LoDTensor) All selected RoIs with highest scores"); + AddOutput("RoisNum", "(Tensor) The number of RoIs in each image.") + .AsDispensable(); AddAttr<int>("post_nms_topN", "Select post_nms_topN RoIs from" " all images and all fpn layers"); @@ -123,3 +136,14 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(collect_fpn_proposals, ops::CollectFpnProposalsOpKernel, ops::CollectFpnProposalsOpKernel); +REGISTER_OP_VERSION(collect_fpn_proposals) + .AddCheckpoint( + R"ROC( + Upgrade collect_fpn_proposals, add a new input + [MultiLevelRoIsNum] and a new output [RoisNum].)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewInput("MultiLevelRoIsNum", + "The RoIs' number of each image on multiple levels."
+ "The number on each level has the shape of (N), " + "N is the number of images.") + .NewOutput("RoisNum", "The number of RoIs in each image.")); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 35222a85cd388f6fef3c61c440be7b36598d9e01..86207052bb2bef4f7bea34c2614fe7686f579de8 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -80,14 +80,27 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { int lod_size; auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto multi_rois_num = ctx.MultiInput("MultiLevelRoIsNum"); for (size_t i = 0; i < roi_ins.size(); ++i) { auto roi_in = roi_ins[i]; auto score_in = score_ins[i]; - auto roi_lod = roi_in->lod().back(); - lod_size = roi_lod.size() - 1; - for (size_t n = 0; n < lod_size; ++n) { - for (size_t j = roi_lod[n]; j < roi_lod[n + 1]; ++j) { - roi_batch_id_data[index++] = n; + if (multi_rois_num.size() > 0) { + framework::Tensor temp; + TensorCopySync(*multi_rois_num[i], platform::CPUPlace(), &temp); + const int* length_in = temp.data(); + lod_size = multi_rois_num[i]->numel(); + for (size_t n = 0; n < lod_size; ++n) { + for (size_t j = 0; j < length_in[n]; ++j) { + roi_batch_id_data[index++] = n; + } + } + } else { + auto length_in = roi_in->lod().back(); + lod_size = length_in.size() - 1; + for (size_t n = 0; n < lod_size; ++n) { + for (size_t j = length_in[n]; j < length_in[n + 1]; ++j) { + roi_batch_id_data[index++] = n; + } } } @@ -190,6 +203,13 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { offset.emplace_back(offset.back() + length_lod_cpu[i]); } + if (ctx.HasOutput("RoisNum")) { + auto* rois_num = ctx.Output("RoisNum"); + int* rois_num_data = rois_num->mutable_data({lod_size}, place); + memory::Copy(place, rois_num_data, place, length_lod_data, + lod_size * sizeof(int), dev_ctx.stream()); + } + framework::LoD lod; lod.emplace_back(offset); fpn_rois->set_lod(lod); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index badd88f0689ba9defcb3f26eb57fef89308aa877..950b8b78933bff6bf1692df61142258dfbc87a8c 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -17,6 +17,7 @@ limitations under the License.*/ #include #include #include +#include #include #include #include "paddle/fluid/framework/op_registry.h" @@ -65,6 +66,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { auto multi_layer_scores = context.MultiInput("MultiLevelScores"); + auto multi_rois_num = context.MultiInput("MultiLevelRoIsNum"); + int num_size = multi_rois_num.size(); auto* fpn_rois = context.Output("FpnRois"); @@ -88,11 +91,21 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { const int num_fpn_level = multi_layer_rois.size(); std::vector integral_of_all_rois(num_fpn_level + 1, 0); for (int i = 0; i < num_fpn_level; ++i) { - auto cur_rois_lod = multi_layer_rois[i]->lod().back(); - integral_of_all_rois[i + 1] = - integral_of_all_rois[i] + cur_rois_lod[cur_rois_lod.size() - 1]; + int all_rois = 0; + if (num_size == 0) { + auto cur_rois_lod = multi_layer_rois[i]->lod().back(); + all_rois = cur_rois_lod[cur_rois_lod.size() - 1]; + } else { + const int* cur_rois_num = multi_rois_num[i]->data(); + all_rois = std::accumulate( + 
cur_rois_num, cur_rois_num + multi_rois_num[i]->numel(), 0); + } + integral_of_all_rois[i + 1] = integral_of_all_rois[i] + all_rois; } + const int batch_size = (num_size == 0) + ? multi_layer_rois[0]->lod().back().size() - 1 + : multi_rois_num[0]->numel(); // concatenate all fpn rois scores into a list // create a vector to store all scores std::vector> scores_of_all_rois( @@ -100,11 +113,20 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { for (int i = 0; i < num_fpn_level; ++i) { const T* cur_level_scores = multi_layer_scores[i]->data(); int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i]; - auto cur_scores_lod = multi_layer_scores[i]->lod().back(); int cur_batch_id = 0; + int pre_num = 0; for (int j = 0; j < cur_level_num; ++j) { - if (static_cast(j) >= cur_scores_lod[cur_batch_id + 1]) { - cur_batch_id++; + if (num_size == 0) { + auto cur_scores_lod = multi_layer_scores[i]->lod().back(); + if (static_cast(j) >= cur_scores_lod[cur_batch_id + 1]) { + cur_batch_id++; + } + } else { + const int* rois_num_data = multi_rois_num[i]->data(); + if (j >= pre_num + rois_num_data[cur_batch_id]) { + pre_num += rois_num_data[cur_batch_id]; + cur_batch_id++; + } } int cur_index = j + integral_of_all_rois[i]; scores_of_all_rois[cur_index].score = cur_level_scores[j]; @@ -134,6 +156,9 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { T* fpn_rois_data = fpn_rois->data(); std::vector lod0(1, 0); int cur_batch_id = 0; + std::vector num_per_batch; + int pre_idx = 0; + int cur_num = 0; for (int i = 0; i < post_nms_topN; ++i) { int cur_fpn_level = scores_of_all_rois[i].level; int cur_level_index = scores_of_all_rois[i].index; @@ -144,6 +169,18 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { if (scores_of_all_rois[i].batch_id != cur_batch_id) { cur_batch_id = scores_of_all_rois[i].batch_id; lod0.emplace_back(i); + cur_num = i - pre_idx; + pre_idx = i; + num_per_batch.emplace_back(cur_num); + } + } + num_per_batch.emplace_back(post_nms_topN - pre_idx); + if (context.HasOutput("RoisNum")) { + auto* rois_num = context.Output("RoisNum"); + int* rois_num_data = + rois_num->mutable_data({batch_size}, context.GetPlace()); + for (int i = 0; i < batch_size; i++) { + rois_num_data[i] = num_per_batch[i]; } } lod0.emplace_back(post_nms_topN); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc index 160d43a917b3c74ff905e070714415d35c5c877c..614b37e703e721337057e04c5611386ff87a1e9e 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -48,6 +49,14 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel { } ctx->SetOutputsDim("MultiFpnRois", outs_dims); ctx->SetOutputDim("RestoreIndex", {-1, 1}); + + if (ctx->HasOutputs("MultiLevelRoIsNum")) { + std::vector outs_num_dims; + for (size_t i = 0; i < num_out_rois; ++i) { + outs_num_dims.push_back({-1}); + } + ctx->SetOutputsDim("MultiLevelRoIsNum", outs_num_dims); + } if (!ctx->IsRuntime()) { for (size_t i = 0; i < num_out_rois; ++i) { ctx->SetLoDLevel("MultiFpnRois", ctx->GetLoDLevel("FpnRois"), i); @@ -66,12 +75,22 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel { class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)"); + AddInput("FpnRois", "(LoDTensor) The RoIs at all levels in shape (-1, 4)"); + AddInput("RoisNum", + "(Tensor) The number of RoIs in shape (B)," + "B is the number of images") + .AsDispensable(); AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator") .AsDuplicable(); AddOutput("RestoreIndex", "(Tensor) An array of positive number which is " "used to restore the order of FpnRois"); + AddOutput("MultiLevelRoIsNum", + "(List of Tensor) The RoIs' number of each image on multiple " + "levels. The number on each level has the shape of (B)," + "B is the number of images.") + .AsDuplicable() + .AsDispensable(); AddAttr("min_level", "The lowest level of FPN layer where the" " proposals come from"); @@ -105,3 +124,14 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals, ops::DistributeFpnProposalsOpKernel, ops::DistributeFpnProposalsOpKernel); +REGISTER_OP_VERSION(distribute_fpn_proposals) + .AddCheckpoint( + R"ROC( + Upgrade distribute_fpn_proposals add a new input + [RoisNum] and add a new output [MultiLevelRoIsNum].)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewInput("RoIsNum", "The number of RoIs in each image.") + .NewOutput("MultiLevelRoisNum", + "The RoIs' number of each image on multiple " + "levels. 
The number on each level has the shape of (B)," + "B is the number of images.")); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 1e3cd9f36c595f978f5b5e5f5c5cf5cad6dc9059..27c06a0f8fb207b5dc85c7875ea91428b16e606c 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -76,12 +76,20 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { int num_level = max_level - min_level + 1; // check that the fpn_rois is not empty - PADDLE_ENFORCE_EQ( - fpn_rois->lod().size(), 1UL, - platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD" - "with one level")); + if (!ctx.HasInput("RoisNum")) { + PADDLE_ENFORCE_EQ( + fpn_rois->lod().size(), 1UL, + platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD" + "with one level")); + } - auto fpn_rois_lod = fpn_rois->lod().back(); + std::vector fpn_rois_lod; + if (ctx.HasInput("RoisNum")) { + auto* rois_num = ctx.Input("RoisNum"); + fpn_rois_lod = GetLodFromRoisNum(rois_num); + } else { + fpn_rois_lod = fpn_rois->lod().back(); + } int lod_size = fpn_rois_lod.size() - 1; int roi_num = fpn_rois_lod[lod_size]; @@ -154,6 +162,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { restore_idx_data, roi_num); int start = 0; + auto multi_rois_num = ctx.MultiOutput("MultiLevelRoIsNum"); + for (int i = 0; i < num_level; ++i) { Tensor sub_lod = sub_lod_list.Slice(i, i + 1); int* sub_lod_data = sub_lod.data(); @@ -180,6 +190,11 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); } + if (multi_rois_num.size() > 0) { + Tensor* rois_num_t = multi_rois_num[i]; + TensorCopySync(sub_lod, dev_ctx.GetPlace(), rois_num_t); + rois_num_t->Resize({lod_size}); + } framework::LoD lod; lod.emplace_back(offset); multi_fpn_rois[i]->set_lod(lod); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index 0c84b385ccbc1dd26453bd957661c0310b7137e3..79498f01536d2fb2616921a2ef1ffa04f13fae64 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -28,6 +28,21 @@ namespace operators { const int kBoxDim = 4; +inline std::vector GetLodFromRoisNum(const Tensor* rois_num) { + std::vector rois_lod; + auto* rois_num_data = rois_num->data(); + Tensor cpu_tensor; + if (platform::is_gpu_place(rois_num->place())) { + TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor); + rois_num_data = cpu_tensor.data(); + } + rois_lod.push_back(static_cast(0)); + for (int i = 0; i < rois_num->numel(); ++i) { + rois_lod.push_back(rois_lod.back() + static_cast(rois_num_data[i])); + } + return rois_lod; +} + template static inline T BBoxArea(const T* box, bool normalized) { if (box[2] < box[0] || box[3] < box[1]) { @@ -65,13 +80,22 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { const int num_level = max_level - min_level + 1; // check that the fpn_rois is not empty - PADDLE_ENFORCE_EQ( - fpn_rois->lod().size(), 1UL, - platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD " - "with one level.")); + if (!context.HasInput("RoisNum")) { + PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL, + platform::errors::InvalidArgument( + 
"DistributeFpnProposalsOp needs LoD " + "with one level.")); + } - auto fpn_rois_lod = fpn_rois->lod().back(); - int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1]; + std::vector fpn_rois_lod; + int fpn_rois_num; + if (context.HasInput("RoisNum")) { + auto* rois_num = context.Input("RoisNum"); + fpn_rois_lod = GetLodFromRoisNum(rois_num); + } else { + fpn_rois_lod = fpn_rois->lod().back(); + } + fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1]; std::vector target_level; // std::vector target_level(fpn_rois_num, -1); // record the number of rois in each level @@ -136,6 +160,18 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { for (int i = 0; i < fpn_rois_num; ++i) { restore_index_data[restore_index_inter[i]] = i; } + auto multi_rois_num = context.MultiOutput("MultiLevelRoIsNum"); + if (multi_rois_num.size() > 0) { + int batch_size = fpn_rois_lod.size() - 1; + for (int i = 0; i < num_level; ++i) { + int* rois_num_data = multi_rois_num[i]->mutable_data( + {batch_size}, context.GetPlace()); + for (int j = 0; j < batch_size; ++j) { + rois_num_data[j] = static_cast(multi_fpn_rois_lod0[i][j + 1] - + multi_fpn_rois_lod0[i][j]); + } + } + } // merge lod information into LoDTensor for (int i = 0; i < num_level; ++i) { framework::LoD lod; diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 981a368e8564fbcd3d688bc67d2def8664bcfe8d..06e560f86d4e0a74f7ae04b155829618ce634697 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/math_function.h" @@ -61,6 +62,10 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { ctx->SetOutputDim("RpnRois", {-1, 4}); ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); + if (!ctx->IsRuntime()) { + ctx->SetLoDLevel("RpnRois", std::max(ctx->GetLoDLevel("Scores"), 1)); + ctx->SetLoDLevel("RpnRoiProbs", std::max(ctx->GetLoDLevel("Scores"), 1)); + } } protected: @@ -347,7 +352,7 @@ class GenerateProposalsKernel : public framework::OpKernel { lod0.push_back(0); anchors.Resize({anchors.numel() / 4, 4}); variances.Resize({variances.numel() / 4, 4}); - std::vector tmp_lod; + std::vector tmp_num; int64_t num_proposals = 0; for (int64_t i = 0; i < num; ++i) { @@ -369,16 +374,16 @@ class GenerateProposalsKernel : public framework::OpKernel { AppendProposals(rpn_roi_probs, num_proposals, scores); num_proposals += proposals.dims()[0]; lod0.push_back(num_proposals); - tmp_lod.push_back(num_proposals); + tmp_num.push_back(proposals.dims()[0]); } - if (context.HasOutput("RpnRoisLod")) { - auto *rpn_rois_lod = context.Output("RpnRoisLod"); - rpn_rois_lod->mutable_data({num}, context.GetPlace()); - int64_t *lod_data = rpn_rois_lod->data(); + if (context.HasOutput("RpnRoisNum")) { + auto *rpn_rois_num = context.Output("RpnRoisNum"); + rpn_rois_num->mutable_data({num}, context.GetPlace()); + int *num_data = rpn_rois_num->data(); for (int i = 0; i < num; i++) { - lod_data[i] = tmp_lod[i]; + num_data[i] = tmp_num[i]; } - rpn_rois_lod->Resize({num}); + rpn_rois_num->Resize({num}); } rpn_rois->set_lod(lod); rpn_roi_probs->set_lod(lod); @@ -433,6 +438,16 @@ class GenerateProposalsKernel : public framework::OpKernel { Tensor keep; 
FilterBoxes(ctx, &proposals, min_size, im_info_slice, &keep); + // Handle the case when there is no keep index left + if (keep.numel() == 0) { + math::SetConstant set_zero; + bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); + set_zero(ctx, &bbox_sel, static_cast(0)); + Tensor scores_filter; + scores_filter.mutable_data({1, 1}, ctx.GetPlace()); + set_zero(ctx, &scores_filter, static_cast(0)); + return std::make_pair(bbox_sel, scores_filter); + } Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); @@ -481,7 +496,8 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor), Output proposals with shape (rois_num, 4)."); AddOutput("RpnRoiProbs", "(LoDTensor) Scores of proposals with shape (rois_num, 1)."); - AddOutput("RpnRoisLod", "(Tensor), rpn rois's lod info").AsDispensable(); + AddOutput("RpnRoisNum", "(Tensor), The number of Rpn RoIs in each image") + .AsDispensable(); AddAttr("pre_nms_topN", "Number of top scoring RPN proposals to keep before " "applying NMS."); @@ -515,3 +531,11 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel, ops::GenerateProposalsKernel); +REGISTER_OP_VERSION(generate_proposals) + .AddCheckpoint( + R"ROC( + Upgrade generate_proposals add a new output [RpnRoisNum])ROC", + paddle::framework::compatible::OpVersionDesc().NewOutput( + "RpnRoisNum", + "The number of Rpn RoIs in each image. RpnRoisNum is " + "dispensable.")); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index fa7670f6d680a95da1c1abd5befe1651ccb7265f..485136d8e2f7ab66f6b1c58deb09036ea5d4e1ec 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -330,6 +330,15 @@ static std::pair ProposalForOneImage( keep_index.Resize({keep_num}); Tensor scores_filter, proposals_filter; + // Handle the case when there is no keep index left + if (keep_num == 0) { + math::SetConstant set_zero; + proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); + scores_filter.mutable_data({1, 1}, ctx.GetPlace()); + set_zero(ctx, &proposals_filter, static_cast(0)); + set_zero(ctx, &scores_filter, static_cast(0)); + return std::make_pair(proposals_filter, scores_filter); + } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); GPUGather(ctx, proposals, keep_index, &proposals_filter); @@ -421,7 +430,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { int64_t num_proposals = 0; std::vector offset(1, 0); - std::vector tmp_lod; + std::vector tmp_num; for (int64_t i = 0; i < num; ++i) { Tensor im_info_slice = im_info->Slice(i, i + 1); @@ -448,15 +457,15 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { dev_ctx.Wait(); num_proposals += proposals.dims()[0]; offset.emplace_back(num_proposals); - tmp_lod.push_back(num_proposals); + tmp_num.push_back(proposals.dims()[0]); } - if (context.HasOutput("RpnRoisLod")) { - auto *rpn_rois_lod = context.Output("RpnRoisLod"); - rpn_rois_lod->mutable_data({num}, context.GetPlace()); - int64_t *lod_data = rpn_rois_lod->data(); - memory::Copy(place, lod_data, cpu_place, &tmp_lod[0], - sizeof(int64_t) * num, dev_ctx.stream()); - rpn_rois_lod->Resize({num}); + if (context.HasOutput("RpnRoisNum")) { + auto *rpn_rois_num = context.Output("RpnRoisNum"); + rpn_rois_num->mutable_data({num}, context.GetPlace()); 
+ int *num_data = rpn_rois_num->data(); + memory::Copy(place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num, + dev_ctx.stream()); + rpn_rois_num->Resize({num}); } framework::LoD lod; lod.emplace_back(offset); diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index ca03400cfd1ef9a27ba8e725381515d5e4ebc0ba..a2279e40623b4ba2f0421e73a6148b89eb970e71 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -176,14 +176,26 @@ static void DistGradFunction(const framework::ExecutionContext& context) { } else if (p == INFINITY || p == -INFINITY) { // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if // j!=i, or equals to sign(z_i) * dout if j=i. - grad_t.device(place) = - (x_minux_y_abs == out_t.broadcast(out_bcast_dims)).template cast() * - sign * out_grad_t.broadcast(out_bcast_dims); + if (platform::is_cpu_place(context.GetPlace())) { + grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) + .template cast() * + sign.eval() * out_grad_t.broadcast(out_bcast_dims); + } else { + grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) + .template cast() * + sign * out_grad_t.broadcast(out_bcast_dims); + } } else { // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout - grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * - out_grad_t.broadcast(out_bcast_dims); + if (platform::is_cpu_place(context.GetPlace())) { + grad_t.device(place) = + (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * + sign.eval() * out_grad_t.broadcast(out_bcast_dims); + } else { + grad_t.device(place) = + (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * + out_grad_t.broadcast(out_bcast_dims); + } } Eigen::DSizes x_reshape_dims; diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.h b/paddle/fluid/operators/distributed_ops/allreduce_op.h index c77113ad405e991db20c035371550a1eccaa1971..e486faa575847311c2d668ada5519fe9c047f053 100644 --- a/paddle/fluid/operators/distributed_ops/allreduce_op.h +++ b/paddle/fluid/operators/distributed_ops/allreduce_op.h @@ -76,7 +76,8 @@ class AllReduceOpKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); } #else - PADDLE_THROW("PaddlePaddle should compile with GPU."); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); #endif } }; diff --git a/paddle/fluid/operators/distributed_ops/broadcast_op.cc b/paddle/fluid/operators/distributed_ops/broadcast_op.cc index 535cf7014419292863a684eaaebbf15d367671ab..61e27887b68c75f3d5c5cc48b4f1fac11d5f4eae 100644 --- a/paddle/fluid/operators/distributed_ops/broadcast_op.cc +++ b/paddle/fluid/operators/distributed_ops/broadcast_op.cc @@ -58,7 +58,8 @@ template class BroadcastOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW("Broadcast op can run on gpu place only for now."); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Broadcast op can run on gpu place only for now.")); } }; diff --git a/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc b/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc index f067840e539ac046b53be7d3bc83c783f7c8cf9c..337422f0bd643f131d5044e802851a09d6171c13 100644 --- a/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc +++ b/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc @@ -68,10 +68,11 @@ class NCCLBroadcastOpKernel : public 
framework::OpKernel { << " From " << root_dev_id << " to " << dev_id; if (ctx.Attr("sync_mode")) { - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); } #else - PADDLE_THROW("PaddlePaddle should compile with GPU."); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); #endif } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc index 457d9e79d7da171ef526d5cab0e59b021cb64f98..5a398fa50febe2efffd588ce8f3612f1f9cec0b6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc @@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp, REGISTER_OP_CPU_KERNEL( elementwise_floordiv, - ops::ElementwiseFloorDivKernel, - ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu index f63d6f037632c1a6a05726b933b2258adc113ee3..60846d1e8fee1c7f68ac101f18355750c2c15a4d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu @@ -19,7 +19,5 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( elementwise_floordiv, - ops::ElementwiseFloorDivKernel, - ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h index 8afe2133c0488bbe04ec4803aac5dce6573f634d..5dc93740949e6e7c25be564927c8fcffde1a18d6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h @@ -14,7 +14,6 @@ limitations under the License. 
*/ #pragma once -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -62,15 +61,8 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, framework::Tensor *z) { int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, FloorDivFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseFloorDivFunctor(), z); - } + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, FloorDivFunctor(), z); } template diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 718321b441b2025afea9d913855b26a82cda8075..e4d3ea6d7291eff8911d8419cda96f2d2738b9a1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - using mkldnn::memory; - auto CanMKLDNNElementwiseMulBeUsed = [&]() { - auto x_dims = ctx.Input("X")->dims(); - auto y_dims = ctx.Input("Y")->dims(); - int rankdiff = x_dims.size() - y_dims.size(); - // TODO(jczaja): Remove this when oneDNN performance for scalar - // broadcasting - // is improved (Ernie large situation) - if (rankdiff != 0 && y_dims.size() == 1 && y_dims[0] == 1) { - return false; - } - - return true; - }; - - if (platform::CanMKLDNNBeUsed(ctx) && CanMKLDNNElementwiseMulBeUsed()) { + if (platform::CanMKLDNNBeUsed(ctx)) { return framework::OpKernelType(input_data_type, ctx.GetPlace(), framework::DataLayout::kMKLDNN, framework::LibraryType::kMKLDNN); diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index b8163169734bd2c64412bab7286aca9cc5e1b830..6ec8f2c2355ee098aed4a6b92410bcc60bca4736 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -33,9 +33,12 @@ namespace operators { static void Memcpy(void *dst, const void *src, size_t n, bool copy_to_gpu) { if (copy_to_gpu) { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE(cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice)); #else - PADDLE_THROW("Not compiled with cuda"); + PADDLE_THROW( + platform::errors::InvalidArgument("Check your paddle version, current " + "version is not compiled with cuda")); #endif } else { std::memcpy(dst, src, n); @@ -88,11 +91,22 @@ bool TestMain(const platform::Place &place, const framework::DDim &dims, framework::LoDTensor cpu_out; auto &out_tensor = scope.FindVar(out_name)->Get(); - PADDLE_ENFORCE(scope.kids().empty()); + PADDLE_ENFORCE_EQ(scope.kids().empty(), true, + platform::errors::InvalidArgument( + "The scope can not have the child scopes," + "please check your code.")); if (inplace) { - PADDLE_ENFORCE_EQ(&out_tensor, x); + PADDLE_ENFORCE_EQ( + &out_tensor, x, + platform::errors::InvalidArgument( + "The output tensor should be same as input x in inplace mode," + " but now is not same.")); } else { - PADDLE_ENFORCE_EQ(&out_tensor, z); + 
PADDLE_ENFORCE_EQ( + &out_tensor, z, + platform::errors::InvalidArgument( + "The output tensor should be same as output z in normal mode," + " but now is not same.")); } if (is_gpu_place) { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h index 89849ef92cd19ff5f83f2b57c65c78610d7c2c69..54e7c7d1b6aa9776f5637359b334e6304d7906ce 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h +++ b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h @@ -92,7 +92,9 @@ class TestElementwiseOpGradGrad { auto dst_place = BOOST_GET_CONST(platform::CUDAPlace, place_); memory::Copy(dst_place, dst, src_place, src, bytes, nullptr); #else - PADDLE_THROW("Not compiled with cuda"); + PADDLE_THROW(platform::errors::InvalidArgument( + "Check your paddle version, current version is not compiled with " + "cuda")); #endif } } @@ -107,7 +109,10 @@ class TestElementwiseOpGradGrad { op->Run(scope_, place_); platform::DeviceContextPool::Instance().Get(place_)->Wait(); framework::LoDTensor cpu_out; - PADDLE_ENFORCE_EQ(scope_.kids().empty(), true, "scope has child scopes"); + PADDLE_ENFORCE_EQ(scope_.kids().empty(), true, + platform::errors::InvalidArgument( + "The scope can not have the child scopes, " + "please check your code.")); // get outputs from scope and compare them with expected_outs bool all_equal = true; diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f539e2e6f6d2d6faa084d1e62ec894b4b65e96bf --- /dev/null +++ b/paddle/fluid/operators/empty_op.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/empty_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class EmptyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("ShapeTensor", + "(Tensor, optional). The shape of the output. " + "It has a higher priority than Attr(shape).") + .AsDispensable(); + AddInput("ShapeTensorList", + "(vector<Tensor>, optional). The shape of the output. " + "It has a higher priority than Attr(shape). " + "The shape of the element in vector must be [1].") + .AsDuplicable() + .AsDispensable(); + AddAttr<std::vector<int64_t>>("shape", + "(vector<int64_t>) The shape of the output") + .SetDefault({}); + AddAttr<int>("dtype", "The data type of output tensor, default is float") + .SetDefault(framework::proto::VarType::FP32); + AddOutput("Out", "(Tensor) The output tensor."); + AddComment(R"DOC(empty operator +Returns a tensor filled with uninitialized data. The shape of the tensor is +defined by the variable argument shape. + + +The type of the tensor is specified by `dtype`.
+)DOC"); + } +}; + +class EmptyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* context) const override { + OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty"); + + if (context->HasInput("ShapeTensor")) { + auto dims = context->GetInputDim("ShapeTensor"); + int num_ele = 1; + for (int i = 0; i < dims.size(); ++i) { + num_ele *= dims[i]; + } + + context->SetOutputDim("Out", framework::make_ddim({num_ele})); + } else if (context->HasInputs("ShapeTensorList")) { + std::vector<int64_t> out_dims; + auto dims_list = context->GetInputsDim("ShapeTensorList"); + for (size_t i = 0; i < dims_list.size(); ++i) { + auto& dims = dims_list[i]; + PADDLE_ENFORCE_EQ( + dims, framework::make_ddim({1}), + platform::errors::InvalidArgument( + "The shape of Tensor in list must be [1]. " + "But received the shape " + "is [%s]", + dims)); + + out_dims.push_back(dims[0]); + } + + context->SetOutputDim("Out", framework::make_ddim(out_dims)); + } else { + auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape"); + context->SetOutputDim("Out", framework::make_ddim(shape)); + } + } + + protected: + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") { + return expected_kernel_type; + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& context) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(context.Attr<int>("dtype")), + context.GetPlace()); + } +}; + +class EmptyOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* context) const override { + auto data_type = static_cast<framework::proto::VarType::Type>( + BOOST_GET_CONST(int, context->GetAttr("dtype"))); + context->SetOutputDataType("Out", data_type); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR( + empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel); diff --git a/paddle/fluid/operators/empty_op.cu.cc b/paddle/fluid/operators/empty_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..22799e507aeff7940274f729b174f50bfd9132a5 --- /dev/null +++ b/paddle/fluid/operators/empty_op.cu.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "paddle/fluid/operators/empty_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + empty, ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel); diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9c91377683870917db28f6f6a5f3f3b1b4a1962f --- /dev/null +++ b/paddle/fluid/operators/empty_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class EmptyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto dtype = static_cast( + context.Attr("dtype")); + + Tensor *out_tensor = context.Output("Out"); + + auto shape = GetShape(context); + out_tensor->Resize(shape); + + out_tensor->mutable_data(context.GetPlace(), dtype); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 3c898ac29f0cab572d199eaafe951751682d4834..83e205367a7af62c52825297d92571c306be2c42 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker { } }; +template +class ExpandDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + if (this->HasInput("expand_times_tensor")) { + op->SetInput("expand_times_tensor", this->Input("expand_times_tensor")); + } + if (this->HasInput("ExpandTimes")) { + op->SetInput("ExpandTimes", this->Input("ExpandTimes")); + } + op->SetAttrMap(this->Attrs()); + op->SetType("expand"); + } +}; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X"); } // namespace operators @@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, ops::ExpandGradOpMaker, ops::ExpandGradOpMaker); REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp, + ops::ExpandDoubleGradOpMaker, + ops::ExpandDoubleGradOpMaker, ops::ExpandGradNoNeedBufVarsInferer); REGISTER_OP_CPU_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 359d512c341529579a56dbe840e5eef0aa3062a5..a1ee47b7f93910a481c6e0793c306e2b190c774d 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -230,6 +230,26 @@ 
class ExpandV2GradOpMaker : public framework::SingleGradOpMaker { } }; +template +class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("expand_v2"); + op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + if (this->HasInput("expand_shapes_tensor")) { + op->SetInput("expand_shapes_tensor", this->Input("expand_shapes_tensor")); + } + if (this->HasInput("Shape")) { + op->SetInput("Shape", this->Input("Shape")); + } + op->SetAttrMap(this->Attrs()); + } +}; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X"); } // namespace operators @@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker, ops::ExpandV2GradOpMaker, ops::ExpandV2GradOpMaker); REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp, + ops::ExpandV2DoubleGradOpMaker, + ops::ExpandV2DoubleGradOpMaker, ops::ExpandV2GradNoNeedBufVarsInferer); REGISTER_OP_CPU_KERNEL( expand_v2, ops::ExpandV2Kernel, diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 74939da08b38dc147c156011759757a605db9444..6fea8fe98bf0e19bbbb023c91f4f9900f5ec1859 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -27,27 +27,6 @@ namespace operators { using Tensor = framework::Tensor; -inline framework::DDim GetShape(const framework::ExecutionContext &ctx, - std::string op_type) { - // 1. shape is a Tensor - if (ctx.HasInput("ShapeTensor")) { - auto *shape_tensor = ctx.Input("ShapeTensor"); - auto vec_shape = GetDataFromTensor(shape_tensor); - return framework::make_ddim(vec_shape); - } - - // 2. shape is a list/tuple containing Tensor - auto shape_tensor_list = ctx.MultiInput("ShapeTensorList"); - if (shape_tensor_list.size() > 0) { - auto vec_shape = GetDataFromTensorList(shape_tensor_list); - return framework::make_ddim(vec_shape); - } - - // 3. 
shape is a list/tuple without containing Tensor - auto vec_shape = ctx.Attr<std::vector<int64_t>>("shape"); - return framework::make_ddim(vec_shape); -} - template <typename T> class FillConstantKernel : public framework::OpKernel<T> { public: @@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel<T> { } value = tensor_data[0]; } - const std::string op_type = "fill_constant"; - auto shape = GetShape(ctx, op_type); + auto shape = GetShape(ctx); if (out_var->IsType<framework::LoDTensor>()) { tensor = out_var->GetMutable<framework::LoDTensor>(); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index c698cb1405fd6f049e01b23613e175ba39c4976e..79fa268f3884b2710fe08eb2907dbd989479d7e6 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { auto blas = math::GetBlas(ctx); for (int64_t i = 0; i < ids_numel; ++i) { - PADDLE_ENFORCE_LT(ids_data[i], row_number); - PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); + PADDLE_ENFORCE_LT( + ids_data[i], row_number, + platform::errors::OutOfRange( + "Value of Ids %d should be less than dict size %d.", i, row_number)); + PADDLE_ENFORCE_GE(ids_data[i], 0, + platform::errors::OutOfRange( + "Value of Ids %d should be greater than zero.", i)); memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, row_width * sizeof(T)); } @@ -473,8 +478,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { auto blas = math::GetBlas(dev_ctx); for (int64_t i = 0; i < ids_numel; ++i) { - PADDLE_ENFORCE_LT(ids_data[i], row_number); - PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); + PADDLE_ENFORCE_LT( + ids_data[i], row_number, + platform::errors::OutOfRange( + "Value of Ids %d should be less than dict size %d.", i, row_number)); + PADDLE_ENFORCE_GE(ids_data[i], 0, + platform::errors::OutOfRange( + "Value of Ids %d should be greater than zero.", i)); memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, row_width * sizeof(T)); } diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index d0920098f606e49d4d1a3e4cb6d8a2b6c44ca267..4013906609603e31b798e333d55ecccba197506a 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -30,16 +30,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru"); - OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru"); auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, - platform::errors::InvalidArgument( - "Input(X)'s rank must be 2, but received input dim " - "size is:%d, input dim is:[%s]", - x_dims.size(), x_dims)); + auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) + ?
framework::flatten_to_2d(x_dims, 1) : x_dims; + PADDLE_ENFORCE_EQ( + x_mat_dims.size(), 2, + platform::errors::InvalidArgument("The size of input X dims should be 2, " "or 3 with second dimension equal to " "1, but now Input X dim is:[%s] ", + x_dims)); auto wx_dims = ctx->GetInputDim("WeightX"); PADDLE_ENFORCE_EQ(wx_dims.size(), 2, @@ -47,12 +49,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { "The rank of Input(WeightX) should be 2, but received " "WeightX dim size is:%d, WeightX dim is:[%s] ", wx_dims.size(), wx_dims)); - PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], - platform::errors::InvalidArgument( - "The first dimension of Input(WeightX) " - "should equal to second dimension of input x, but " - "received WeightX dimension is:%d, x dimension is:%d", - wx_dims[0], x_dims[1])); + PADDLE_ENFORCE_EQ( + wx_dims[0], x_mat_dims[1], + platform::errors::InvalidArgument( + "The first dimension of flattened WeightX " + "should be equal to the last dimension of flattened input X, but " + "received flattened WeightX dimension is:%d, flattened X dimension " + "is:%d", + wx_dims[0], x_mat_dims[1])); int frame_size = wx_dims[1] / 3; auto wh_dims = ctx->GetInputDim("WeightH"); @@ -102,24 +106,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { "received bias dim is:[%s], frame size is:%d", b_dims, frame_size)); } - framework::DDim out_dims({x_dims[0], frame_size}); + framework::DDim out_dims({x_mat_dims[0], frame_size}); ctx->SetOutputDim("Hidden", out_dims); ctx->ShareLoD("X", "Hidden"); int xx_width; if (ctx->Attrs().Get<bool>("use_seq")) { xx_width = wx_dims[1]; } else { - xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; + xx_width = x_mat_dims[1] > wx_dims[1] ? wx_dims[1] : x_mat_dims[1]; OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut", "fusion_gru"); - ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchedInput", {x_mat_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedOut", out_dims); } - ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->SetOutputDim("XX", {x_mat_dims[0], xx_width}); ctx->ShareLoD("X", "XX"); } @@ -202,6 +206,27 @@ void FusionGRUOpMaker::Make() { AddAttr<bool>("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr<std::string>( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "int8", "bfloat16"}); + AddAttr<float>("Scale_data", + "Scale to be used for int8 input/output data. " + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr<float>("Shift_data", + "Shift to be used for int8 input/output data. " + "Only used with MKL-DNN INT8.") + .SetDefault(0.0f); + AddAttr<std::vector<float>>("Scale_weights", + "Scale_weights to be used for int8 weights data. " + "Only used with MKL-DNN INT8.") + .SetDefault({1.0f}); + AddAttr<bool>("force_fp32_output", + "(bool, default false) Force INT8 kernel output FP32, only " + "used in MKL-DNN INT8") + .SetDefault(false); AddComment(R"DOC( The Fusion complete GRU Operator.
This operator fuse the fully-connected operator into GRU, @@ -220,14 +245,17 @@ class FusionGRUKernel : public framework::OpKernel { } } -#define INIT_BASE_DEFINES \ - auto* x = ctx.Input("X"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* xx = ctx.Output("XX"); \ - auto x_lod = x->lod(); \ - auto x_dims = x->dims(); /* T x M*/ \ - auto wh_dims = wh->dims(); /* D x 3D*/ \ - const int total_T = x_dims[0]; \ +#define INIT_BASE_DEFINES \ + auto* x = ctx.Input("X"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* xx = ctx.Output("XX"); \ + auto x_lod = x->lod(); \ + auto x_dims = x->dims(); /* T x M*/ \ + auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) \ + ? framework::flatten_to_2d(x_dims, 1) \ + : x_dims; \ + auto wh_dims = wh->dims(); /* D x 3D*/ \ + const int total_T = x_mat_dims[0]; \ const int D3 = wh_dims[1] #define INIT_OTHER_DEFINES \ @@ -236,7 +264,7 @@ class FusionGRUKernel : public framework::OpKernel { auto* bias = ctx.Input("Bias"); \ auto* hidden_out = ctx.Output("Hidden"); \ bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ + const int M = x_mat_dims[1]; \ const int D = wh_dims[0]; \ const int D2 = D * 2; \ const jit::gru_attr_t attr( \ diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 3940aae53b8ef70c15311305ce13f8929400d405..5fad1b116de6437e62e311318832ad77e24a40cc 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -21,11 +21,12 @@ namespace operators { using paddle::framework::LoDTensor; using paddle::framework::Tensor; using paddle::platform::CPUDeviceContext; +using paddle::platform::CreateKey; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; using platform::to_void_cast; -template +template class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { public: GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, @@ -38,7 +39,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { const std::string& unique_name) : platform::MKLDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(unique_name, Ti)), + CreateKey(unique_name, MKLDNNGetDataType(), Ti)), N(N), Ti(Ti), IC(IC), @@ -47,9 +48,29 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // do not depend on Ti size but primitive and input/output memory do if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() != platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) { - memory_key_ = unique_name; + memory_key_ = CreateKey(unique_name, MKLDNNGetDataType()); } else { - memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr(); + memory_key_ = CreateKey(unique_name, MKLDNNGetDataType(), "-t:", + platform::ThreadIDasStr()); + } + + // Is it int8 kernel + const bool is_INT8 = std::is_same::value; + + if (is_INT8) { + // Int8 attributes + const float scale_data = ctx.Attr("Scale_data"); + const float shift_data = ctx.Attr("Shift_data"); + const auto scale_weights = ctx.Attr>("Scale_weights"); + + const int weights_scale_mask = + 0 + + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo` + + + (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo` + + attr_.set_rnn_data_qparams(scale_data, shift_data); + attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights); } if (!this->isCached()) { @@ -63,6 +84,10 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { 
platform::errors::Unimplemented( "oneDNN fusion_gru supports only tanh as an activation.")); + // Weights for int8 kernel are of a type s8 + const auto weights_dt = + is_INT8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32; + // oneDNN RNN dimensions const int64_t D = 1; // Directions const int64_t L = 1; // Layers (PP supports only 1 stacked layer) @@ -71,19 +96,16 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // Create memory descriptors auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto weight_x_md = MKLDNNMemDesc( - {L, D, IC, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto weight_h_md = MKLDNNMemDesc( - {L, D, OC, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); + auto weight_x_md = + MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); + auto weight_h_md = + MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldgo); - auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType(), + auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto h0_md = dnnl::memory::desc(); - if (h0) { - h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldnc); - } + auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc); // Create GRU oneDNN primitive const auto direction = @@ -91,7 +113,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { : dnnl::rnn_direction::unidirectional_left2right; this->AcquireForwardPrimitiveDescriptor( - dnnl::prop_kind::forward_inference, direction, input_md, h0_md, + attr_, dnnl::prop_kind::forward_inference, direction, input_md, h0_md, weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc()); } } @@ -101,29 +123,31 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { dnnl::memory::format_tag::ntc); } - void reorderRNNdata(const T* input_data, T* output_data, + void reorderRNNdata(void* input_data, void* output_data, std::vector lod, const bool is_reverse, platform::RNNReorderType reorder_type) { switch (reorder_type) { // Reorder input memory [WORDS, C] + LoD -> [N, T, C] case platform::RNNReorderType::PP_NTC: { - auto* input_data_iter = input_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = (lod[n + 1] - lod[n]) * IC; const auto offset = is_reverse ? (Ti * IC - num_elements) : 0; - memcpy(output_data + n * Ti * IC + offset, input_data_iter, + memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter, sizeof(T) * num_elements); input_data_iter += num_elements; } } break; // Reorder input memory [WORDS, C] + LoD -> [T, N, C] case platform::RNNReorderType::PP_TNC: { - auto* input_data_iter = input_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = (lod[n + 1] - lod[n]); const auto offset = is_reverse ? 
(Ti - num_elements) : 0; for (size_t t = 0; t < num_elements; ++t) { - memcpy(output_data + (t + offset) * N * IC + n * IC, + memcpy(output_data_iter + (t + offset) * N * IC + n * IC, input_data_iter, sizeof(T) * IC); input_data_iter += IC; } @@ -131,24 +155,27 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { } break; // Reorder output values to PP format [N, T, C] -> [WORDS, C] case platform::RNNReorderType::NTC_PP: { - auto* output_data_iter = output_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = (lod[n + 1] - lod[n]) * OC; const auto offset = is_reverse ? (Ti * OC - num_elements) : 0; - memcpy(output_data_iter, input_data + n * Ti * OC + offset, - sizeof(T) * num_elements); + memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset, + sizeof(T_out) * num_elements); output_data_iter += num_elements; } } break; // Reorder output values to PP format [T, N, C] -> [WORDS, C] case platform::RNNReorderType::TNC_PP: { - auto* output_data_iter = output_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = lod[n + 1] - lod[n]; const auto offset = is_reverse ? (Ti - num_elements) : 0; for (size_t t = 0; t < num_elements; ++t) { memcpy(output_data_iter, - input_data + (t + offset) * N * OC + n * OC, sizeof(T) * OC); + input_data_iter + (t + offset) * N * OC + n * OC, + sizeof(T_out) * OC); output_data_iter += OC; } } @@ -169,9 +196,9 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { } const auto& input_lod = input->lod()[0]; - auto* x_data = input->data(); + auto* x_data = to_void_cast(input->data()); - auto* x_onednn_data = reinterpret_cast(memory_p->get_data_handle()); + auto* x_onednn_data = memory_p->get_data_handle(); memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC); if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) == @@ -198,19 +225,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { return memory_p; } + // TODO(grygielski) H0 is for now persistable std::shared_ptr AcquireH0Memory(const Tensor* h0) { const std::string h0_key = memory_key_ + "@h0"; auto memory_p = std::static_pointer_cast(this->dev_ctx_.GetBlob(h0_key)); - auto* h0_data = to_void_cast(h0->data()); - if (!memory_p) { - memory_p = std::make_shared( - this->fwd_pd_->weights_layer_desc(), this->engine_, h0_data); + auto user_h0_memory = dnnl::memory(); + if (h0) { + user_h0_memory = + dnnl::memory({{1, 1, N, OC}, + MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc}, + this->engine_, to_void_cast(h0->data())); + } else { + user_h0_memory = dnnl::memory({{1, 1, N, OC}, + MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc}, + this->engine_); + memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC); + } + memory_p = std::make_shared(this->fwd_pd_->src_iter_desc(), + this->engine_); + + dnnl::stream astream(this->engine_); + dnnl::reorder(user_h0_memory, *memory_p, attr_) + .execute(astream, user_h0_memory, *memory_p); + this->dev_ctx_.SetBlob(h0_key, memory_p); - } else { - memory_p->set_data_handle(h0_data); } return memory_p; } @@ -245,7 +288,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { this->fwd_pd_->weights_layer_desc(), this->engine_); dnnl::stream astream(this->engine_); - dnnl::reorder(user_memory, *memory_p) + dnnl::reorder(user_memory, *memory_p, attr_) .execute(astream, user_memory, *memory_p); 
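// Editorial note (an assumption about oneDNN semantics, not part of the
// original patch): because attr_ now carries the rnn_weights_qparams set in
// the handler's constructor on the INT8 path, this reorder does double duty.
// It converts the user's fp32 weights from the ldigo layout into the
// primitive's preferred layout and, when the destination descriptor uses s8,
// quantizes them with the per-gate / per-output-channel scales selected by
// weights_scale_mask. On the fp32 path attr_ is default-constructed, so the
// same call degenerates to a plain layout conversion.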
this->dev_ctx_.SetBlob(wx_key, memory_p); @@ -298,7 +341,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { this->fwd_pd_->weights_iter_desc(), this->engine_); dnnl::stream astream(this->engine_); - dnnl::reorder(user_memory, *memory_p) + dnnl::reorder(user_memory, *memory_p, attr_) .execute(astream, user_memory, *memory_p); this->dev_ctx_.SetBlob(wh_key, memory_p); @@ -347,12 +390,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // Memory size of weights, bias and h0 does not depend // on Ti size, thus we need another key to cache them std::string memory_key_; + dnnl::primitive_attr attr_; }; template class FusionGRUMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const bool is_INT8 = std::is_same::value; + const bool force_fp32_output = ctx.Attr("force_fp32_output"); + + // TODO(grygielski) Add option for bfloat + if (!is_INT8 || force_fp32_output) { + RunKernel(ctx); + } else { + RunKernel(ctx); + } + } + + template + void RunKernel(const framework::ExecutionContext& ctx) const { auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -364,13 +421,16 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { const auto* weight_h = ctx.Input("WeightH"); const auto* bias = ctx.Input("Bias"); auto* hidden = ctx.Output("Hidden"); - + auto x_dims = input->dims(); + auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) + ? framework::flatten_to_2d(x_dims, 1) + : x_dims; // Get attributes const bool is_reverse = ctx.Attr("is_reverse"); const bool origin_mode = ctx.Attr("origin_mode"); // Get tensor dimensions - const auto x_dims = framework::vectorize(input->dims()); + const auto x_mat_dims_vec = framework::vectorize(x_mat_dims); const auto weight_h_dims = framework::vectorize(weight_h->dims()); const auto& input_lod = input->lod()[0]; @@ -384,15 +444,17 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { } return res; }(); - const int64_t IC = x_dims[1]; // Input channels - const int64_t OC = weight_h_dims[0]; // Output channels + const int64_t IC = x_mat_dims_vec[1]; // Input channels + const int64_t OC = weight_h_dims[0]; // Output channels - GRUMKLDNNHandler handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), - input, weight_h, h0, is_reverse, N, Ti, IC, OC, - ctx.InputName("X") + ctx.InputName("WeightH")); + GRUMKLDNNHandler handler( + ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0, + is_reverse, N, Ti, IC, OC, + ctx.InputName("X") + ctx.InputName("WeightH")); auto input_memory_p = handler.AcquireInputMemoryWithReorder(input, is_reverse); + auto h0_memory_p = handler.AcquireH0Memory(h0); auto weight_x_memory_p = handler.AcquireWeightXMemory(weight_x, origin_mode); auto weight_h_memory_p = @@ -402,25 +464,21 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { std::unordered_map gru_args = { {DNNL_ARG_SRC_LAYER, *input_memory_p}, + {DNNL_ARG_SRC_ITER, *h0_memory_p}, {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p}, {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p}, {DNNL_ARG_BIAS, *bias_memory_p}, {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}}; - if (h0) { - auto h0_memory_p = handler.AcquireH0Memory(h0); - gru_args.insert({DNNL_ARG_SRC_ITER, *h0_memory_p}); - } - auto gru_forward_p = handler.AcquireForwardPrimitive(); dnnl::stream astream(mkldnn_engine); gru_forward_p->execute(astream, gru_args); astream.wait(); - auto* hidden_onednn_data = - reinterpret_cast(hidden_onednn_memory_p->get_data_handle()); - auto* 
hidden_data = hidden->mutable_data(ctx.GetPlace()); + auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle(); + auto* hidden_data = + to_void_cast(hidden->mutable_data(ctx.GetPlace())); if (handler.is_NTC()) { handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod, is_reverse, platform::RNNReorderType::NTC_PP); @@ -436,4 +494,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace, - ops::FusionGRUMKLDNNKernel); + ops::FusionGRUMKLDNNKernel, + ops::FusionGRUMKLDNNKernel); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 28afeb6f541c68fe7e0719a782fd8c9147b15163..a99879316d684ca95e73ce8db43e988efcbab4c4 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -37,8 +37,21 @@ class GatherOp : public framework::OperatorWithKernel { "Output(Out) of GatherOp should not be null.")); auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE(index_dims.size() == 1 || - (index_dims.size() == 2 && index_dims[1] == 1)); + + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), 1, + platform::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + int batch_size = ctx->GetInputDim("Index")[0]; framework::DDim output_dims(ctx->GetInputDim("X")); output_dims[0] = batch_size; diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 4f128463375b91803a7a4d02a27dd78157961aac..17a71c67b8a084c114497eb97568e9b536161711 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); std::normal_distribution dist(mean, std); - const std::string op_type = "gaussian_random"; - auto shape = GetShape(context, op_type); + auto shape = GetShape(context); tensor->Resize(shape); int64_t size = tensor->numel(); T* data = tensor->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 69c8b60040651179784cd6b77c31c66e892231be..7a0c93eb1b2eaa7afaae7f0a604a0da5ac0fd75d 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -58,8 +58,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel { T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); thrust::counting_iterator index_sequence_begin(0); - const std::string op_type = "gaussian_random"; - auto shape = GetShape(context, op_type); + auto shape = GetShape(context); tensor->Resize(shape); T* data = tensor->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index deb71b807128e5c0b173b517e60832894ced41e5..f5224239eb2ded9a156aadc9185eca89f4e3396f 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -115,7 +115,7 @@ class GridSampleOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr( "padding_mode", "(bool, default true) The padding method used when source" - "index is out of 
input images. It can be 'zeros', 'reflect' and " + "index is out of input images. It can be 'zeros', 'reflection' and " "'border'.") .SetDefault("zeros"); @@ -174,6 +174,10 @@ class GridSampleOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "grid_sampler"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Grid")), "Output", + framework::GradVarName("Grid"), "grid_sampler"); auto input_dims = ctx->GetInputDim("X"); auto grid_dims = ctx->GetInputDim("Grid"); if (ctx->HasOutput(framework::GradVarName("X"))) { diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index 999f990448ca6370dadacbdaee5bf3bcadcaca0e..4e61d0c2ea7f91e4199c3e9daa3e93ac45bc0eb8 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -268,7 +268,7 @@ class GridSampleOpCUDAKernel : public framework::OpKernel { Mode mode; if (padding_mode_s == "border") { padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflect") { + } else if (padding_mode_s == "reflection") { padding_mode = PaddingMode::reflect; } else { padding_mode = PaddingMode::zeros; @@ -432,7 +432,7 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel { Mode mode; if (padding_mode_s == "border") { padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflect") { + } else if (padding_mode_s == "reflection") { padding_mode = PaddingMode::reflect; } else { padding_mode = PaddingMode::zeros; diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index eda800e78faf5da2bb379b8101e4823c5bc2d2f8..b8faef759ae90e14d1e83b66130bfe957b51907b 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -76,7 +76,7 @@ static inline void clip(const platform::CPUDeviceContext& ctx, if (padding_mode == "border") { grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) .cwiseMin(static_cast(max_val)); - } else if (padding_mode == "reflect") { + } else if (padding_mode == "reflection") { if (align_corners) { auto double_range = static_cast(max_val * 2); auto grid_abs = grid_slice_t.abs(); @@ -117,7 +117,7 @@ static inline void clipWithMask(const platform::CPUDeviceContext& ctx, auto in_bound = (res == grid_slice_t); grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); grid_slice_t.device(place) = res; - } else if (padding_mode == "reflect") { + } else if (padding_mode == "reflection") { if (align_corners) { auto double_range = static_cast(max_val * 2); auto is_neg = (grid_slice_t < static_cast(0)); diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index 12733a0d9f1689a020f77d23cc31b0d19b412746..1f7dde9b931dafa4b8e0bee211e64461b1c21dc5 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -67,7 +67,7 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { scale_tensor[0], 1, platform::errors::InvalidArgument( "Scale's shape must be 1, but got shape = %d .", scale_tensor[0])); - // out_w = -1; + out_w = -1; } else { auto scale = ctx->Attrs().Get>("scale"); if (scale.size() > 0) { @@ -159,8 +159,8 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* 
ctx) { platform::errors::InvalidArgument( "Scale's shape must be 2 or 1, but got shape = %d .", scale_tensor[0])); - // out_h = -1; - // out_w = -1; + out_h = -1; + out_w = -1; } else { auto scale = ctx->Attrs().Get>("scale"); if (scale.size() > 0) { @@ -264,9 +264,9 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { platform::errors::InvalidArgument( "Scale's shape must be 3 or 1, but got shape = %d .", scale_tensor[0])); - // out_d = -1; - // out_h = -1; - // out_w = -1; + out_d = -1; + out_h = -1; + out_w = -1; } else { auto scale = ctx->Attrs().Get>("scale"); if (scale.size() > 0) { @@ -633,6 +633,9 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateV2GradNoNeedBufferVarsInferer, } // namespace operators } // namespace paddle +// interp_v2 support scale_factor whose input type is list, this operation is +// not +// compatible with interp_op, so a new one is added in paddle2.0 namespace ops = paddle::operators; REGISTER_OPERATOR(bilinear_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 6cb8104638dea458743374014e7bef35df2dbfcc..816539c3b5fdb805d16fb8224b7c960f797613cb 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -836,12 +836,12 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1; if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); out_w = new_size[0]; } else { - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -887,8 +887,11 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1.0) / (out_w - 1.0) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_cw = c * in_w; @@ -924,14 +927,14 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1; + float scale_h = -1; if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); out_h = new_size[0]; out_w = new_size[1]; } else { - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -993,12 +996,18 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_hw = in_h * in_w; @@ -1048,6 +1057,9 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1; + float scale_d = -1; + float scale_h = -1; if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); @@ -1055,9 +1067,6 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, out_h = new_size[1]; out_w = new_size[2]; } else { - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -1129,16 +1138,25 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_dhw = in_d * in_h * in_w; @@ -1230,8 +1248,11 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_cw = c * in_w; int out_cw = c * out_w; @@ -1333,12 +1354,18 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_hw = in_h * in_w; @@ -1464,16 +1491,25 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? 
static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_dhw = in_d * in_h * in_w; diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index 111766934b8300c0a7b46ae9a065b8c42460e577..4e4fd9ff63ba47b41363a81d6cc527486671d695 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -783,12 +783,13 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1.; if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_size_tensor); out_w = new_size[0]; } else { - float scale_w = -1; + // float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -833,8 +834,11 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("linear" == interp_method) { LinearInterpolation(input, output, ratio_w, in_w, n, c, out_w, @@ -856,6 +860,8 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); + float scale_h = -1; + float scale_w = -1; auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { @@ -864,8 +870,6 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, out_h = new_size[0]; out_w = new_size[1]; } else { - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -925,12 +929,18 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("bilinear" == interp_method) { @@ -962,6 +972,10 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor @@ -970,9 +984,6 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, out_h = new_size[1]; out_w = new_size[2]; } else { - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -1043,16 +1054,25 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("trilinear" == interp_method) { @@ -1127,8 +1147,11 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("linear" == interp_method) { LinearInterpolationGrad(output_grad, input_grad, ratio_w, in_w, n, c, @@ -1216,12 +1239,18 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("bilinear" == interp_method) { @@ -1327,16 +1356,25 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? 
static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("trilinear" == interp_method) { diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index af737ec42f631c534bb26ad38901e03d804d07b3..9b92ce3e538aa660dedda67de0cabaa4adbdc8c7 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -43,7 +43,11 @@ class OverflowOp : public framework::OperatorWithKernel { } else if (x_var->IsType()) { dtype = x_var->Get().value().type(); } else { - PADDLE_THROW("Cannot find the input data type by all input data"); + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The input type mismatch, the type of Input(X) must be Tensor or " + "SelectedRows, please check your input.")); } return framework::OpKernelType(framework::proto::VarType::Type(dtype), ctx.GetPlace()); diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 83b080856366ac3332c5856a19b721893bb80eb3..2fc0d58669bae428d811c7200e025f36f087b905 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -57,7 +57,11 @@ class OverflowKernel : public framework::OpKernel { auto& in = ctx.Input("X")->value(); functor(in, out); } else { - PADDLE_THROW("Unsupported input type."); + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The input type mismatch, the type of Input(X) must be Tensor or " + "SelectedRows, please check your input.")); } } }; diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h index 369fdb4872b4184d706a5264b58f70f63051fca1..857ecda303c2607b1b6fb9a5d2ec132b335d6c29 100644 --- a/paddle/fluid/operators/kldiv_loss_op.h +++ b/paddle/fluid/operators/kldiv_loss_op.h @@ -72,7 +72,11 @@ class KLDivLossKernel : public framework::OpKernel { loss_t.device(place) = output; } else if ("batchmean" == reduction) { auto output_sum = output.sum(); - loss_t.device(place) = output_sum / output_sum.constant(n); + if (n > 0) { + loss_t.device(place) = output_sum / output_sum.constant(n); + } else { + loss_t.device(place) = output_sum; + } } else if ("mean" == reduction) { loss_t.device(place) = output.mean(); } else if ("sum" == reduction) { diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index 2c3172d2a1112e2c79a3c1215ccd0d3f08d59451..667c6e892956e29478f1401c3cb2622713433037 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -22,8 +22,6 @@ class LinspaceOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Start"), - "Input(Start) of LinspaceOp should not be null."); OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu index 793253b6b8894de8d89b301921383ebfd53d66fc..c51e8785263b5de7a897f3865ed2dabdf93adfaa 100644 --- 
a/paddle/fluid/operators/linspace_op.cu +++ b/paddle/fluid/operators/linspace_op.cu @@ -63,7 +63,10 @@ class CUDALinspaceKernel : public framework::OpKernel { framework::TensorCopy(*num_t, platform::CPUPlace(), &n); int32_t num = n.data()[0]; - PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0."); + PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( + "The num of linspace op should be larger " + "than 0, but received num is %d", + num)); out->Resize(framework::make_ddim({num})); T* out_data = out->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h index 898f611f864dc8bfac2ba7e41b91f5f5bbe524ab..2c30a66ef8e937127fb69a459a901164934b5b13 100644 --- a/paddle/fluid/operators/linspace_op.h +++ b/paddle/fluid/operators/linspace_op.h @@ -46,7 +46,10 @@ class CPULinspaceKernel : public framework::OpKernel { T start = start_t.data()[0]; T stop = stop_t.data()[0]; - PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0."); + PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( + "The num of linspace op should be larger " + "than 0, but received num is %d", + num)); out->Resize(framework::make_ddim({num})); diff --git a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index a920bf7c3f505b839f8f1fd252c9f8505393f3a9..f6d65704388e6ec90c9209475e5f4b19061085fa 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase { private: std::vector in_names_; std::vector out_names_; - paddle::lite::Predictor *engine_; + paddle::lite_api::PaddlePredictor *engine_; framework::proto::VarType::Type precision_; bool use_gpu_; bool zero_copy_; @@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase { framework::LoDTensor src_t = inference::analysis::GetFromScope(scope, in_names_[i]); - paddle::lite::Tensor *dst_t = engine_->GetInput(i); + paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i)); VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> " << engine_->GetInputNames()[i] << ")"; - inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_); + inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_); } #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(dev_place)) { @@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase { engine_->Run(); VLOG(3) << "lite engine run done"; for (size_t i = 0; i < out_names_.size(); i++) { - paddle::lite::Tensor src_t = *(engine_->GetOutput(i)); + paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i)); framework::LoDTensor *dst_t = &inference::analysis::GetFromScope( scope, out_names_[i]); diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index fb5c0dcb3514de815b97944d0fdbf3bd7853b628..76c963ac652687cb0f65a0497b5c994f82d0d7aa 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) { inference::lite::EngineConfig config; config.valid_places = { #ifdef PADDLE_WITH_CUDA - paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}), #endif - paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), - paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kX86), 
PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}), }; serialize_params(&(config.param), &scope, repetitive_params); config.model = program.Proto()->SerializeAsString(); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 3a19c7edff3569d503480fd060a6432dc59d2108..10d335b828b516fe08871f314ba4667c06f04714 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -9,7 +9,11 @@ function(math_library TARGET) set(hip_srcs) set(math_common_deps device_context framework_proto enforce) if (WITH_GPU) - list(APPEND math_common_deps cub) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + list(APPEND math_common_deps cub) + else() + list(APPEND math_common_deps) + endif() endif() set(multiValueArgs DEPS) cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 411dbca25bb48c99dfd16779f54e46a3e80d0d4e..270a9d3f80a80d5ea2c8b97d4a69125355ddef61 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -79,8 +79,16 @@ void ConcatCase1(DeviceContext* context) { concat_functor(*context, input, 0, &out); // check the dim of input_a, input_b - PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); - PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. Tensor dims: [%s], declared dims: [%s]", + input_a.dims(), dim_a)); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. Tensor dims: [%s], declared dims: [%s]", + input_b.dims(), dim_b)); int* out_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { @@ -95,10 +103,14 @@ void ConcatCase1(DeviceContext* context) { int idx_a = 0, idx_b = 0; for (int j = 0; j < 5 * 3 * 4; ++j) { if (j >= cols) { - PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]); + PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_b; } else { - PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]); + PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_a; } } @@ -166,8 +178,16 @@ void ConcatCase2(DeviceContext* context) { concat_functor(*context, input, 1, &out); // check the dim of input_a, input_b - PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); - PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. Tensor dims: [%s], declared dims: [%s]", + input_a.dims(), dim_a)); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. 
Tensor dims: [%s], declared dims: [%s]", + input_b.dims(), dim_b)); int* out_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { @@ -183,10 +203,16 @@ void ConcatCase2(DeviceContext* context) { for (int i = 0; i < 2; ++i) { for (int j = 0; j < 28; ++j) { if (j >= cols) { - PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 28 + j], b_ptr[idx_b], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_b; } else { - PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 28 + j], a_ptr[idx_a], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_a; } } @@ -255,8 +281,16 @@ void ConcatCase3(DeviceContext* context) { concat_functor(*context, input, 2, &out); // check the dim of input_a, input_b - PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); - PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. Tensor dims: [%s], declared dims: [%s]", + input_a.dims(), dim_a)); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. Tensor dims: [%s], declared dims: [%s]", + input_b.dims(), dim_b)); int* out_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { @@ -273,10 +307,16 @@ void ConcatCase3(DeviceContext* context) { for (int i = 0; i < 6; ++i) { for (int j = 0; j < 9; ++j) { if (j >= cols) { - PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 9 + j], b_ptr[idx_b], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_b; } else { - PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 9 + j], a_ptr[idx_a], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_a; } } @@ -347,8 +387,16 @@ void ConcatCase4(DeviceContext* context) { context->Wait(); // check the dim of input_a, input_b - PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); - PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. Tensor dims: [%s], declared dims: [%s]", + input_a.dims(), dim_a)); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. 
Tensor dims: [%s], declared dims: [%s]", + input_b.dims(), dim_b)); int* out_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { @@ -365,10 +413,16 @@ void ConcatCase4(DeviceContext* context) { for (int i = 0; i < 2; ++i) { for (int j = 0; j < 24; ++j) { if (j >= cols) { - PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 24 + j], b_ptr[idx_b], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_b; } else { - PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 24 + j], a_ptr[idx_a], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_a; } } diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h index e9019c6d2fe6890ee92cb5a3b047666e3c2a7e04..051c6019d74f7d2820dc0ba668da3cafe8864346 100644 --- a/paddle/fluid/operators/math/context_project.h +++ b/paddle/fluid/operators/math/context_project.h @@ -134,7 +134,10 @@ class ContextProjectFunctor { } } if (padding_trainable) { - PADDLE_ENFORCE_NOT_NULL(padding_data); + PADDLE_ENFORCE_NOT_NULL( + padding_data, + platform::errors::InvalidArgument( + "The input tensor 'padding_data' should not be NULL.")); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { if (lod_level_0[i] == lod_level_0[i + 1]) continue; diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 8940a41424b01c975f1264ca309cc09fc3c7ae85..925f3b6161ae8506107f917196e77ecb2d9c5593 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -621,7 +621,10 @@ class VecActivations { } else if (type == "identity" || type == "") { return vec_identity; } - PADDLE_THROW("Not support type: %s", type); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type should be one of sigmoid, relu, tanh, identity. But got " + "unsupported type: %s.", + type)); } }; diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index c7fac60dd3e663088813f795352e4d751059de39..84fa0d6af990e22083ec1a0e3993893cefad1ab5 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -27,8 +27,8 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, const int ignore_index) { CUDA_KERNEL_LOOP(i, N) { PADDLE_ENFORCE(label[i] >= 0 && label[i] < D || label[i] == ignore_index, - "label[%d] expected >= 0 and < %ld, or == %ld, but got " - "%ld. Please check input value.", + "The value of label[%d] expected >= 0 and < %ld, or == %ld, " + "but got %ld. Please check input value.", i, D, ignore_index, label[i]); Y[i] = ignore_index == label[i] ? static_cast(0) diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 094a7237826610af574061263e5b0df5eafdf239..6fb393d791cc2a077dbcd0a912bcf31b5d59ad65 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -34,9 +34,16 @@ class Im2ColFunctor& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im.dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. 
But got " + "the dims of tensor 'im' is [%s].", + im.dims())); PADDLE_ENFORCE_EQ(col->dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col->dims())); if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && dilation[1] == 1) { @@ -70,9 +77,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* im, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im->dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col.dims())); int im_channels = (data_layout != DataLayout::kNHWC ? im->dims()[0] : im->dims()[2]); int im_height = @@ -88,16 +102,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im.dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im.dims())); PADDLE_ENFORCE_EQ(col->dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col->dims())); int im_channels = im.dims()[0]; int im_height = im.dims()[1]; int im_width = im.dims()[2]; @@ -218,9 +239,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* im, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im->dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col.dims())); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; @@ -231,14 +259,14 @@ class Col2ImFunctordata(); const T* col_data = col.data(); diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 97719300daed9c02a716f31d853e3a381312961c..f2a2148ba6954f50cf59ae30f4f4be6aa070739f 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -81,9 +81,16 @@ class Im2ColFunctor& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im.dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im.dims())); PADDLE_ENFORCE_EQ(col->dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. 
But got " + "the dims of tensor 'col' is [%s].", + col->dims())); int im_channels = (data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]); @@ -182,9 +189,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* im, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im->dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col.dims())); int im_channels = (data_layout != DataLayout::kNHWC ? im->dims()[0] : im->dims()[2]); @@ -201,16 +215,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im.dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im.dims())); PADDLE_ENFORCE_EQ(col->dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col->dims())); int im_channels = im.dims()[0]; int im_height = im.dims()[1]; @@ -370,9 +391,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* im, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im->dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col.dims())); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; @@ -386,16 +414,16 @@ class Col2ImFunctor { const framework::Tensor& input, const framework::Tensor& vector, framework::Tensor* output) { auto in_dims = input.dims(); + auto out_dims = output->dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); + PADDLE_ENFORCE_EQ( + vector.numel(), size, + platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." + " Expected vector size=%d, but received %d", + size, vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ(out_dims, in_dims, + platform::errors::InvalidArgument( + "The output tensor shape should be same as the input" + " tensor shape. 
Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, out_dims_cstr)); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(vector); diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index fba143d017deb4b4814ad8b10e614357a7ebee23..1c519d226ebfe5ff19876f17b79fd36aa12c4130 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -88,9 +88,24 @@ struct RowwiseAdd { const framework::Tensor& input, const framework::Tensor& vector, framework::Tensor* output) { auto in_dims = input.dims(); + auto out_dims = output->dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); + PADDLE_ENFORCE_EQ( + vector.numel(), size, + platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." + " Expected vector size=%d, but received %d", + size, vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ( + out_dims, in_dims, + platform::errors::InvalidArgument( + "The output tensor shape should be same as the input tensor" + " shape. Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, out_dims_cstr)); int blocks = 512; int grids = (input.numel() + blocks - 1) / blocks; RowwiseAddKernel<<>>( @@ -113,7 +128,12 @@ void ColwiseSum::operator()( framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector->numel(), size); + PADDLE_ENFORCE_EQ(vector->numel(), size, + platform::errors::InvalidArgument( + "The size of input vector" + " should be equal to the size of input tensor column" + " dimension. Expected vector size=%d, but received %d", + size, vector->numel())); framework::Tensor one; one.mutable_data({in_dims[0]}, context.GetPlace()); SetConstant set; @@ -134,7 +154,12 @@ void RowwiseSum::operator()( framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]); + PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0], + platform::errors::InvalidArgument( + "The size of input vector" + " should be equal to the size of input tensor row" + " dimension. Expected vector size=%d, but received %d", + in_dims[0], vector->numel())); framework::Tensor one; one.mutable_data({size}, context.GetPlace()); SetConstant set; diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index 693d5620460e1fe6f6d82bd0749b0780b64841f5..869a3054598da9cd2223ca0e705c0f910ba043ec 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -59,7 +59,12 @@ void ColwiseSum::operator()(const DeviceContext& context, framework::Tensor* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(out->numel(), size); + PADDLE_ENFORCE_EQ(out->numel(), size, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor column" + " dimension. 
Expected output size=%d, but received %d", + size, out->numel())); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(*out); @@ -78,7 +83,13 @@ class ColwiseSum { auto& in_dims = input.dims(); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), size); + PADDLE_ENFORCE_EQ( + out->numel(), size, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor column" + " dimension. Expected output size=%d, but received %d", + size, out->numel())); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -100,8 +111,16 @@ void RowwiseMean::operator()(const DeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. Expected output size=%d, but received %d", + in_dims[0], out->numel())); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(*out); @@ -118,10 +137,19 @@ class RowwiseMean { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + PADDLE_ENFORCE_EQ( + out->numel(), height, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. Expected output size=%d, but received %d", + height, out->numel())); auto inv_size = 1.0 / size; T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -141,8 +169,16 @@ void RowwiseSum::operator()(const DeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. 
Expected output size=%d, but received %d", + in_dims[0], out->numel())); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(*out); @@ -159,10 +195,19 @@ class RowwiseSum { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + PADDLE_ENFORCE_EQ( + out->numel(), height, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. Expected output size=%d, but received %d", + height, out->numel())); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 2343e0ee965303c9fdb2ad3faf9ddf6e5bb7782f..587823e535ac67f926fd469d2f43df536c8c88b6 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -224,7 +224,11 @@ TEST(math_funciton, set_constant) { auto* ctx = new paddle::platform::CPUDeviceContext(); paddle::operators::math::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { - PADDLE_ENFORCE_EQ(10, t.data()[i]); + PADDLE_ENFORCE_EQ(10, t.data()[i], + paddle::platform::errors::InvalidArgument( + "Each value of input " + "tensor should be 10, but received %d.", + t.data()[i])); } delete ctx; }
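A note for readers of the message strings throughout this patch: adjacent C++ string literals are concatenated by the compiler with no separator, so every fragment has to carry its own spacing. A minimal, self-contained illustration (the message text is just an example):

    #include <cstdio>

    int main() {
      // Without the trailing space after "input ", the two fragments would
      // fuse into "...value of inputtensor should be 10.".
      const char* msg =
          "Each value of input "
          "tensor should be 10.";
      std::puts(msg);
      return 0;
    }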
diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index bcbb4a8274f149240b9f0990f38d9f38bdd0e5b1..44b1ee45a4fe9b6f2ea7ba5e09c7cbc60c1aff28 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -18,7 +18,12 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, const std::vector& data) { - PADDLE_ENFORCE_EQ(size, data.size()); + PADDLE_ENFORCE_EQ( + size, data.size(), + paddle::platform::errors::InvalidArgument( + "The size of argument data should" + " be equal to the argument size. Expected %d, but received %d.", + size, data.size())); for (size_t i = 0; i < data.size(); ++i) { in_ptr[i] = paddle::platform::float16(data[i]); } diff --git a/paddle/fluid/operators/math/padding.h b/paddle/fluid/operators/math/padding.h index 63f793433de07ea2e43ad03ea3ccae1a259f7ae2..379b21c3c18888989663221052e6e99df80e7e9d 100644 --- a/paddle/fluid/operators/math/padding.h +++ b/paddle/fluid/operators/math/padding.h @@ -85,8 +85,9 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context, PadFunction(context, pads, src, pad_value, out); break; default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); + PADDLE_THROW(platform::errors::Unimplemented( + "PadOp only supports tensors with no more" + " than 6 dimensions currently.")); } } @@ -114,8 +115,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, PadGradFunction(context, pads, src, out); break; default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); + PADDLE_THROW(platform::errors::Unimplemented( + "PadOp only supports tensors with no more" + " than 6 dimensions currently.")); } } diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 480576ef9dc8c21811a1a867d553ccc6d97fa22a..de9113f2bb616b489747d8d960154f55bb988847 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -19,6 +19,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/platform/enforce.h" + namespace paddle { namespace operators { namespace math { @@ -31,7 +33,10 @@ namespace math { class Sampler { public: explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { - // PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0."); + PADDLE_ENFORCE_GT(range, 0, platform::errors::InvalidArgument( + "Range should be" + " greater than 0, but received %d.", + range)); if (seed == 0) { std::random_device r; seed_ = r();
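The hunks that follow all apply the same mechanical upgrade: a bare PADDLE_ENFORCE / PADDLE_ENFORCE_EQ gains an error class from platform::errors plus a printf-style message that reports both the expected and the received operand. A hedged sketch of the shape of that change (operand names are illustrative):

    // Before: the failed check reports no context.
    // PADDLE_ENFORCE_EQ(in1_height, input2.height());
    // After: the check names an error class and echoes both operands.
    PADDLE_ENFORCE_EQ(
        in1_height, input2.height(),
        platform::errors::InvalidArgument(
            "The two inputs height must be equal. "
            "But received first input height = [%d], second input height = [%d]",
            in1_height, input2.height()));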
+ "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, in2_value.numel() / in2_rows.size())); + PADDLE_ENFORCE_EQ( + in1_row_numel, out_value->numel() / out_rows.size(), + platform::errors::InvalidArgument( + "The input and oupput width must be equal." + "But recieved input width = [%d], output width = [%d]", + in1_row_numel, out_value->numel() / out_rows.size())); auto in1_place = input1.place(); - PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto in2_place = input2.place(); - PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto out_place = context.GetPlace(); - PADDLE_ENFORCE(platform::is_cpu_place(out_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(out_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto* out_data = out_value->data(); auto* in1_data = in1_value.data(); @@ -82,15 +103,35 @@ struct SelectedRowsAddTensor { auto in1_height = input1.height(); auto in2_dims = input2.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); - PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); + PADDLE_ENFORCE_EQ( + in1_height, out_dims[0], + platform::errors::InvalidArgument( + "The input and output height must be equal." + "But recieved input height = [%d], output height = [%d]", + in1_height, out_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); - PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2.numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2.numel() / in1_height)); + PADDLE_ENFORCE_EQ( + in1_row_numel, output->numel() / in1_height, + platform::errors::InvalidArgument( + "The input and output width must be equal." + "But recieved input width = [%d], output width = [%d]", + in1_row_numel, output->numel() / in1_height)); SetConstant functor; functor(context, output, 0.0); @@ -121,7 +162,12 @@ struct SelectedRowsAddTo { const int64_t input2_offset, framework::SelectedRows* input2) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + PADDLE_ENFORCE_EQ( + in1_height, input2->height(), + platform::errors::InvalidArgument("The two inputs height must be equal." 
+ "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, input2->height())); auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); @@ -133,9 +179,13 @@ struct SelectedRowsAddTo { in2_rows.Extend(in1_rows.begin(), in1_rows.end()); auto in1_place = input1.place(); - PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto in2_place = input2->place(); - PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -163,7 +213,12 @@ struct SelectedRowsSumTo { auto& in_rows = (*iter)->rows(); size += in_rows.end() - in_rows.begin(); auto in1_height = (*iter)->height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + PADDLE_ENFORCE_EQ(in1_height, input2->height(), + platform::errors::InvalidArgument( + "The two inputs height must be equal." + "But recieved first input height = [%d], second " + "input height = [%d]", + in1_height, input2->height())); } // concat rows std::vector in2_rows; @@ -201,13 +256,23 @@ struct SelectedRowsAddToTensor { } auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." 
+ "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* input2_data = input2->data(); @@ -302,10 +367,12 @@ struct MergeAdd { continue; } PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); + platform::errors::InvalidArgument( + "All inputs should have same " + "dimension except for the first one.")); PADDLE_ENFORCE_EQ(input_height, input->height(), - "all input should have same height"); + platform::errors::InvalidArgument( + "All inputs should have same height.")); row_num += input->rows().size(); merged_row_set.insert(input->rows().begin(), input->rows().end()); } @@ -421,10 +488,12 @@ struct MergeAverage { continue; } PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); + platform::errors::InvalidArgument( + "All inputs should have same " + "dimension except for the first one.")); PADDLE_ENFORCE_EQ(input_height, input->height(), - "all input should have same height"); + platform::errors::InvalidArgument( + "All input should have same height.")); row_num += input->rows().size(); merged_row_set.insert(input->rows().begin(), input->rows().end()); } @@ -492,13 +561,23 @@ struct UpdateToTensor { framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* input2_data = input2->data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 9cce52c6d4587baf01ba22eebc9c57da04c26590..35bd02ad35b71eb7deb3299490fa545ef8b23dc6 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -30,7 +30,12 @@ struct SelectedRowsAdd { const framework::SelectedRows& input2, framework::SelectedRows* output) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2.height()); + PADDLE_ENFORCE_EQ( + in1_height, input2.height(), + platform::errors::InvalidArgument("The two inputs height must be equal." 
+ "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, input2.height())); output->set_height(in1_height); framework::Vector in1_rows(input1.rows()); @@ -48,18 +53,34 @@ struct SelectedRowsAdd { auto& in2_value = input2.value(); auto in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); - PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + PADDLE_ENFORCE_EQ( + in1_row_numel, in2_value.numel() / in2_rows.size(), + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, in2_value.numel() / in2_rows.size())); + PADDLE_ENFORCE_EQ( + in1_row_numel, out_value->numel() / out_rows.size(), + platform::errors::InvalidArgument( + "The input and oupput width must be equal." + "But recieved input width = [%d], output width = [%d]", + in1_row_numel, out_value->numel() / out_rows.size())); auto* out_data = out_value->data(); auto* in1_data = in1_value.data(); auto in1_place = input1.place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto in2_place = input2.place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto out_place = context.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data, BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data, @@ -104,15 +125,35 @@ struct SelectedRowsAddTensor { auto in1_height = input1.height(); auto in2_dims = input2.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); - PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument( + "The two inputs height must be equal." + "But recieved first input height = [%d], first input height = [%d]", + in1_height, in2_dims[0])); + PADDLE_ENFORCE_EQ( + in1_height, out_dims[0], + platform::errors::InvalidArgument( + "The input and output height must be equal." + "But recieved input height = [%d], output height = [%d]", + in1_height, out_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); - PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2.numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2.numel() / in1_height)); + PADDLE_ENFORCE_EQ( + in1_row_numel, output->numel() / in1_height, + platform::errors::InvalidArgument( + "The input and output width must be equal." 
+ "But recieved input width = [%d], output width = [%d]", + in1_row_numel, output->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* in2_data = input2.data(); @@ -148,7 +189,12 @@ struct SelectedRowsAddTo { const int64_t input2_offset, framework::SelectedRows* input2) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + PADDLE_ENFORCE_EQ( + in1_height, input2->height(), + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, input2->height())); auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); @@ -162,9 +208,13 @@ struct SelectedRowsAddTo { } auto in1_place = input1.place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto in2_place = input2->place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -209,13 +259,23 @@ struct SelectedRowsAddToTensor { framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* in2_data = input2->data(); @@ -340,10 +400,12 @@ struct MergeAdd { continue; } PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); + platform::errors::InvalidArgument( + "All input should have same " + "dimension except for the first one.")); PADDLE_ENFORCE_EQ(input_height, input->height(), - "all input should have same height"); + platform::errors::InvalidArgument( + "All input should have same height.")); merged_row_set.insert(input->rows().begin(), input->rows().end()); } std::vector merge_rows_cpu(merged_row_set.begin(), @@ -448,13 +510,23 @@ struct UpdateToTensor { auto in1_height = merged_in1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." 
+ "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = merged_in1.value(); auto& in1_rows = merged_in1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.template data(); auto* in2_data = input2->data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 74892316e6decdeab3a08396fa2f4bdeb8eb7b73..81ad620466ee3d9fcd9d3e057cfd0dd9053089f0 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -38,7 +38,9 @@ TEST(selected_rows_functor, gpu_add) { {static_cast(rows1.size()), row_numel}), gpu_place); functor(ctx, in1_value, 1.0); - PADDLE_ENFORCE(cudaDeviceSynchronize()); + PADDLE_ENFORCE_EQ(cudaDeviceSynchronize(), 0, + paddle::platform::errors::PreconditionNotMet( + "The all synchronization on the cuda is error!")); std::vector rows2{0, 5, 7, 9}; std::unique_ptr selected_rows2{ diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc index e4ffeedb5a0061dd60ca3a30aa9928ef8b05887c..300a3692012ab9631d7049d2042e91fb99ad3c21 100644 --- a/paddle/fluid/operators/math/sequence2batch.cc +++ b/paddle/fluid/operators/math/sequence2batch.cc @@ -29,11 +29,24 @@ class CopyMatrixRowsFunctor { auto src_dims = src.dims(); auto dst_dims = dst->dims(); PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, - "The src must be matrix with rank 2."); + platform::errors::InvalidArgument( + "The source tensor must be a matrix with rank 2, but " + "got the source tensor rank is %lu. " + "Please check the rank of the source tensor", + src_dims.size())); PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL, - "The dst must be matrix with rank 2."); - PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], - "The width of src and dst must be same."); + platform::errors::InvalidArgument( + "The destination tensor must be a matrix with rank, " + "but got the destination tensor rank is %lu. " + "Please check the rank of the destination tensor", + dst_dims.size())); + PADDLE_ENFORCE_EQ( + src_dims[1], dst_dims[1], + platform::errors::InvalidArgument( + "The width of the source tensor and the destination tensor must be " + "same. But got %lu != %lu.Please check the rank of the source " + "tensor", + src_dims.size(), dst_dims.size())); auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu index 9ab13659c1cc5b59d28395bcebcfb43fac5b4544..cd1ca572689bc701da801384e5ed08fe6dc10749 100644 --- a/paddle/fluid/operators/math/sequence2batch.cu +++ b/paddle/fluid/operators/math/sequence2batch.cu @@ -46,11 +46,24 @@ class CopyMatrixRowsFunctor { auto src_dims = src.dims(); auto dst_dims = dst->dims(); PADDLE_ENFORCE_EQ(src_dims.size(), 2, - "The src must be matrix with rank 2."); + platform::errors::InvalidArgument( + "The source tensor must be a matrix with rank 2, but " + "got the source tensor rank is %lu. 
" + "Please check the rank of the source tensor", + src_dims.size())); PADDLE_ENFORCE_EQ(dst_dims.size(), 2, - "The dst must be matrix with rank 2."); - PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], - "The width of src and dst must be same."); + platform::errors::InvalidArgument( + "The destination tensor must be a matrix with rank, " + "but got the destination tensor rank is %lu. " + "Please check the rank of the destination tensor", + dst_dims.size())); + PADDLE_ENFORCE_EQ( + src_dims[1], dst_dims[1], + platform::errors::InvalidArgument( + "The width of the source tensor and the destination tensor must be " + "same. But got %lu != %lu.Please check the rank of the source " + "tensor", + src_dims.size(), dst_dims.size())); auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h index 9d9f7ef00b8a12088225fd3620cb30b43ef9dce9..6aa513e4d10eef49c02417e98b31cddd57088d7c 100644 --- a/paddle/fluid/operators/math/sequence2batch.h +++ b/paddle/fluid/operators/math/sequence2batch.h @@ -64,19 +64,30 @@ class LoDTensor2BatchFunctor { bool is_reverse = false) const { if (!is_cal_batch_lod) { auto lods = batch->lod(); - PADDLE_ENFORCE_GT(lods.size(), 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); + PADDLE_ENFORCE_GT( + lods.size(), 2UL, + platform::errors::InvalidArgument( + "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information, but got the LoD level is %lu. Please " + "check the input value.", + lods.size())); PADDLE_ENFORCE_EQ( lods[1].size(), static_cast(lod_tensor.dims()[0]), - "The LoD information should be consistent with the dims."); + platform::errors::InvalidArgument( + "The LoD information should be consistent with the dims, but got " + "%lu != %lu. Please check the input value.", + lods[1].size(), static_cast(lod_tensor.dims()[0]))); CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1], batch, true); return; } auto lods = lod_tensor.lod(); - PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lods.size(), 1UL, + platform::errors::InvalidArgument( + "Only support one level sequence now, but got the " + "LoD level is %lu. Please check the input value.", + lods.size())); const auto& lod = lods[0]; @@ -161,12 +172,19 @@ class Batch2LoDTensorFunctor { const framework::LoDTensor& batch, framework::LoDTensor* lod_tensor) const { auto in_lod = batch.lod(); - PADDLE_ENFORCE_GT(in_lod.size(), 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); + PADDLE_ENFORCE_GT( + in_lod.size(), 2UL, + platform::errors::InvalidArgument( + "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information, but got the LoD level is %lu. Please check " + "the input value.", + in_lod.size())); PADDLE_ENFORCE_EQ( in_lod[1].size(), static_cast(lod_tensor->dims()[0]), - "The LoD information should be consistent with the dims."); + platform::errors::InvalidArgument( + "The LoD information should be consistent with the dims, but got " + "%lu != %lu. 
Please check the input value.", + in_lod[1].size(), static_cast(lod_tensor->dims()[0]))); CopyMatrixRowsFunctor to_seq; to_seq(context, batch, in_lod[1], lod_tensor, false); } diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 4630689dec160da145e607f662a802444ac98b55..076df0176429c7bbd350698af0137fbcca18f806 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -35,7 +35,11 @@ void CopyValidData(framework::Tensor* dst_tensor, int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; PADDLE_ENFORCE_GE( pad_seq_len, valid_seq_len, - "The padded sequence length can not be less than its original length."); + platform::errors::InvalidArgument( + "The padded sequence length can not " + "be less than its original length. Expected %ld >= %ld, but got " + "%ld < %ld. Please check input value.", + pad_seq_len, valid_seq_len, pad_seq_len, valid_seq_len)); int seq_data_offset = seq_offsets[seq_idx] * step_width; int pad_data_offset = layout == kBatchLengthWidth ? seq_idx * pad_seq_len * step_width @@ -95,9 +99,14 @@ class PaddingLoDTensorFunctor { CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, step_width, layout); - PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width'."); + + PADDLE_ENFORCE_EQ( + pad_value.numel() == 1 || pad_value.numel() == step_width, true, + platform::errors::InvalidArgument( + "The numel of 'pad_value' can only be 1 or be equal to the " + "'step_width', but got %ld != 1 and %ld. Please check the input " + "value.", + pad_value.numel(), step_width)); // fill padding value T* pad_data = pad_tensor->data(); diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 1b433067900af71bb8a6833cef019d41f9c76858..19c3af03411b8ce95d274532707c7ee3e93f1d55 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -66,17 +66,25 @@ class PaddingLoDTensorFunctor { if (pad_seq_len == -1) { pad_seq_len = max_seq_len; } - PADDLE_ENFORCE_GE(pad_seq_len, max_seq_len, - "The pad_seq_len must be equal to or greater than the " - "original max sequence length."); + PADDLE_ENFORCE_GE( + pad_seq_len, max_seq_len, + platform::errors::InvalidArgument( + "The pad_seq_len must be equal to or greater than the " + "original max sequence length. Expected %ld >= %ld, but got %ld < " + "%ld. Please check the input value.", + pad_seq_len, max_seq_len, pad_seq_len, max_seq_len)); int step_width = seq_tensor.numel() / seq_tensor_dims[0]; int seq_num = seq_offsets.size() - 1; CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, step_width, layout); - PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width'."); + PADDLE_ENFORCE_EQ( + pad_value.numel() == 1 || pad_value.numel() == step_width, true, + platform::errors::InvalidArgument( + "The numel of 'pad_value' can only be 1 or be equal to " + "the 'step_width', but got %ld != 1 and %ld. 
Please check the " + "input value.", + pad_value.numel(), step_width)); const int kBlockSize = 512; diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h index 5580ee5374658c3b7b8e31962cd50f1d72113ba0..956a4ff6a2d45cb619183f9beba1b7e35b7f229c 100644 --- a/paddle/fluid/operators/math/sequence_padding.h +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -52,14 +52,25 @@ inline static void CheckDims(const framework::DDim& seq_tensor_dims, const framework::Vector& seq_offset, int64_t padded_seq_len, int64_t step_width, const PadLayout& layout) { - PADDLE_ENFORCE_EQ(static_cast(seq_tensor_dims[0]), seq_offset.back(), - "Value of 1st dimension of the sequence tensor should be " - "equal to sum of lengths of all sequences."); + PADDLE_ENFORCE_EQ( + static_cast(seq_tensor_dims[0]), seq_offset.back(), + platform::errors::InvalidArgument( + "Value of 1st dimension of the sequence tensor should be " + "equal to sum of lengths of all sequences. Expected %ld == %ld, but " + "got %ld != %ld. Please check the input value.", + static_cast(seq_tensor_dims[0]), seq_offset.back(), + static_cast(seq_tensor_dims[0]), seq_offset.back())); - PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || - seq_tensor_dims.size() == pad_tensor_dims.size(), - "pad_tensor's rank should be 1 greater than seq_tensor's " - "rank, or be equal with it."); + PADDLE_ENFORCE_EQ( + seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || + seq_tensor_dims.size() == pad_tensor_dims.size(), + true, platform::errors::InvalidArgument( + "pad_tensor's rank should be 1 greater than seq_tensor's " + "rank, or be equal with it. The pad_tensor's rank is %ld, " + "expected the seq_tensor's rank is %ld or %ld, but got %ld. " + "Please check the input value.", + pad_tensor_dims.size(), pad_tensor_dims.size(), + pad_tensor_dims.size() - 1, seq_tensor_dims.size())); } /* diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index cc3fbd587668b17b7edde50b157adca83e81eddc..2eee4d0a6c14e8b6134b71294745c71302450347 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -42,15 +42,29 @@ class MaxSeqPoolFunctor { auto out_dims = output->dims(); auto idx_dims = index->dims(); PADDLE_ENFORCE_GT(in_dims.size(), 1, - "The rank of input shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of input shall be greater than 1, but got " + "the rank is %ld. Please check the input value", + in_dims.size())); PADDLE_ENFORCE_GT(out_dims.size(), 1, - "The rank of output shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of output shall be greater than 1, but got " + "the rank is %ld. Please check the input value", + out_dims.size())); for (int64_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i], - "The dimension of input and output shall be same."); + PADDLE_ENFORCE_EQ( + in_dims[i], out_dims[i], + platform::errors::InvalidArgument( + "The dimension of input and output shall be same. Expected %ld " + "== %ld, but got %ld != %ld. Please check the input value.", + in_dims[i], out_dims[i], in_dims[i], out_dims[i])); } - PADDLE_ENFORCE_EQ(idx_dims, out_dims, - "The dimension of index and output shall be same."); + PADDLE_ENFORCE_EQ( + idx_dims, out_dims, + platform::errors::InvalidArgument( + "The dimension of index and output shall be same. Expected %ld == " + "%ld, but got %ld != %ld. 
Please check the input value.", + idx_dims, out_dims, idx_dims, out_dims)); auto lod_level = input.lod().size(); auto starts = input.lod()[lod_level - 1]; @@ -94,12 +108,22 @@ class MaxSeqPoolFunctor { auto in_dims = input.dims(); auto out_dims = output->dims(); PADDLE_ENFORCE_GT(in_dims.size(), 1, - "The rank of input shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of input shall be greater than 1, but got " + "%ld <= 1. Please check the input value.", + in_dims.size())); PADDLE_ENFORCE_GT(out_dims.size(), 1, - "The rank of output shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of output shall be greater than 1, but got " + "%ld <= 1. Please check the input value.", + out_dims.size())); for (int64_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i], - "The dimension of input and output shall be same."); + PADDLE_ENFORCE_EQ( + in_dims[i], out_dims[i], + platform::errors::InvalidArgument( + "The dimension of input and output shall be same. Expected %ld " + "== %ld, but got %ld != %ld. Please check the input value.", + in_dims[i], out_dims[i], in_dims[i], out_dims[i])); } auto lod_level = input.lod().size(); @@ -139,16 +163,29 @@ class MaxSeqPoolGradFunctor { auto ig_dims = in_grad->dims(); auto idx_dims = index.dims(); PADDLE_ENFORCE_GT(og_dims.size(), 1, - "The rank of output@Grad shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of output@Grad shall be greater than 1, " + "but got %ld <= 1. Please check the input value.", + og_dims.size())); PADDLE_ENFORCE_GT(ig_dims.size(), 1, - "The rank of input@Grad shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of input@Grad shall be greater than 1, but " + "got %ld <= 1. Please check the input value.", + ig_dims.size())); for (int64_t i = 1; i < og_dims.size(); ++i) { - PADDLE_ENFORCE_EQ( - og_dims[i], ig_dims[i], - "The dimension of input@Grad and output@Grad shall be same."); + PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i], + platform::errors::InvalidArgument( + "The dimension of input@Grad and output@Grad shall " + "be same. Expected %ld == %ld, but got %ld != %ld. " + "Please check the input value.", + og_dims[i], ig_dims[i], og_dims[i], ig_dims[i])); } - PADDLE_ENFORCE_EQ(idx_dims, og_dims, - "The dimension of index and output@Grad shall be same."); + PADDLE_ENFORCE_EQ( + idx_dims, og_dims, + platform::errors::InvalidArgument( + "The dimension of index and output@Grad shall be same. Expected " + "%ld == %ld, but got %ld != %ld. Please check the input value.", + idx_dims, og_dims, idx_dims, og_dims)); const T* og_data = out_grad.data(); const int* max_index = index.data(); @@ -244,9 +281,12 @@ class SumSeqPoolGradFunctor { auto lod = in_grad->lod()[lod_level - 1]; int64_t out_w = out_grad.numel() / out_grad.dims()[0]; int64_t in_w = in_grad->numel() / in_grad->dims()[0]; - PADDLE_ENFORCE_EQ( - in_w, out_w, - "The feature size of input@Grad and output@Grad shall be same."); + PADDLE_ENFORCE_EQ(in_w, out_w, + platform::errors::InvalidArgument( + "The feature size of input@Grad and output@Grad " + "shall be same. Expected %ld == %ld, but got %ld != " + "%ld. 
Please check the input value.", + in_w, out_w, in_w, out_w)); const T* out_g_data = out_grad.data(); T* in_g_data = in_grad->mutable_data(context.GetPlace()); auto blas = math::GetBlas(context); @@ -298,7 +338,8 @@ class SequencePoolFunctor { auto place = context.GetPlace(); PADDLE_ENFORCE_EQ( platform::is_cpu_place(place), true, - "Sequence_pool should run on CPU Device when pooltype is SUM"); + platform::errors::InvalidArgument( + "Sequence_pool should run on CPU Device when pooltype is SUM")); const T* src = input.data(); T* dst = output->mutable_data(place); jit::seq_pool_attr_t attr( @@ -342,7 +383,10 @@ class SequencePoolFunctor { out_e.device(place) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); } else { - PADDLE_THROW("unsupported pooling pooltype"); + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported pooling pooltype: %s. Only support \"AVERAGE\" and " + "\"SQRT\"", + pooltype)); } } } @@ -400,7 +444,10 @@ class SequencePoolGradFunctor { } else if (pooltype == "FIRST") { in_g_e.chip(0, 0).device(place) = out_g_e_v; } else { - PADDLE_THROW("unsupported pooling pooltype"); + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported pooling pooltype: %s. Only support \"AVERAGE\", " + "\"SQRT\", \"LAST\" and \"FIRST\"", + pooltype)); } } } diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 422b06c70eb2107659666edf58223ae8e4666b1d..cba8dd935ef1b3625f5f68578e411aec81eaa4f4 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -205,7 +205,10 @@ class SequencePoolFunctor { lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else { - PADDLE_THROW("unsupported pooling pooltype"); + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported pooling pooltype: %s. Only support \"MAX\", " + "\"AVERAGE\", \"SUM\", \"SQRT\", \"LAST\" and \"FIRST\"", + pooltype)); } } }; @@ -370,7 +373,10 @@ class SequencePoolGradFunctor { in_grad->mutable_data(context.GetPlace()), nullptr); } else { - PADDLE_THROW("unsupported pooling pooltype"); + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported pooling pooltype: %s. Only support \"MAX\", " + "\"AVERAGE\", \"SUM\", \"SQRT\", \"LAST\" and \"FIRST\"", + pooltype)); } } }; diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index efab1a375b56bea3caec2c8169dc390298a37cbe..4b5f484e52c6acb2d7fb2cea6265c8dd7826571b 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -50,9 +50,21 @@ void TestSequencePoolingSum(const DeviceContext &context, in_grad.mutable_data(in_dims, place); // check tensor contruction result - PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size()); + PADDLE_ENFORCE_EQ( + in_grad.dims().size(), out_grad.dims().size(), + paddle::platform::errors::InvalidArgument( + "The dimension of input and output shall be same. Expected %ld == " + "%ld, but got %ld != %ld. Please check the input value.", + in_grad.dims().size(), out_grad.dims().size(), in_grad.dims().size(), + out_grad.dims().size())); for (int64_t i = 1; i < out_grad.dims().size(); ++i) { - PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]); + PADDLE_ENFORCE_EQ( + in_grad.dims()[i], out_grad.dims()[i], + paddle::platform::errors::InvalidArgument( + "The dimension of input and output shall be same. 
Expected %ld == " + "%ld, but got %ld != %ld. Please check the input value.", + in_grad.dims()[i], out_grad.dims()[i], in_grad.dims()[i], + out_grad.dims()[i])); } // call functor diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index cafcf6319326cc9a496de8ee6aa1033e1320f4b0..0344226ea66e2a64fce0574cb45a8c2ce918359c 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -55,7 +55,11 @@ void Tree2ColUtil::construct_tree(const paddle::Tensor &EdgeSet, std::vector> *tr, size_t *node_count) { auto edge_set_dims = EdgeSet.dims(); - PADDLE_ENFORCE_EQ(edge_set_dims[1], 2); + PADDLE_ENFORCE_EQ(edge_set_dims[1], 2, + platform::errors::InvalidArgument( + "The second dimension of the EdgeSet shall be 2, but " + "got %ld != 2. Please check the input value.", + edge_set_dims[1])); int64_t edge_count = EdgeSet.numel(); const int *edge_data = EdgeSet.data(); diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc index 13f0845bb8579615381c06072eb1e32507ffd3cf..9ad2ec5005203d02032aaa6f7e48b27f52631059 100644 --- a/paddle/fluid/operators/math/unpooling.cc +++ b/paddle/fluid/operators/math/unpooling.cc @@ -37,7 +37,13 @@ class Unpool2dMaxFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + PADDLE_ENFORCE_LT( + index, output_feasize, + platform::errors::InvalidArgument( + "index should less than output tensor height * output tensor " + "width. Expected %ld < %ld, but got " + "%ld >= %ld. Please check input value.", + index, output_feasize, index, output_feasize)); output_data[index] = input_data[i]; } input_data += input_feasize; @@ -72,7 +78,13 @@ class Unpool2dMaxGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + PADDLE_ENFORCE_LT( + index, output_feasize, + platform::errors::InvalidArgument( + "index should less than output tensor height * output tensor " + "width. Expected %ld < %ld, but got " + "%ld >= %ld. Please check input value.", + index, output_feasize, index, output_feasize)); input_grad_data[i] = output_grad_data[index]; } input_grad_data += input_feasize; diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 01f50727b442579fa62059560d0c75d329d6e288..c05da0062f2bab66746feb9d8ebedeca0c0f9688 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -34,10 +34,16 @@ class Vol2ColFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* col, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol.dims().size(), 4, - "The dimension of vol should be 4."); - PADDLE_ENFORCE_EQ(col->dims().size(), 7, - "The dimension of col should be 7."); + PADDLE_ENFORCE_EQ( + vol.dims().size(), 4, + platform::errors::InvalidArgument("The dimension of" + " vol should be 4, but received %d.", + vol.dims().size())); + PADDLE_ENFORCE_EQ( + col->dims().size(), 7, + platform::errors::InvalidArgument("The dimension of" + "col should be 7, but received %d.", + col->dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); @@ -65,27 +71,33 @@ class Vol2ColFunctor { int pad_w_left = paddings_size_is_6 ? 
paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); const T* vol_data = vol.data(); T* col_data = col->data(); @@ -140,10 +152,16 @@ class Col2VolFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* vol, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol->dims().size(), 4, - "The dimension of vol should be 4."); - PADDLE_ENFORCE_EQ(col.dims().size(), 7, - "The dimension of col should be 7."); + PADDLE_ENFORCE_EQ( + vol->dims().size(), 4, + platform::errors::InvalidArgument("The dimension of vol" + " should be 4, but received %d.", + vol->dims().size())); + PADDLE_ENFORCE_EQ( + col.dims().size(), 7, + platform::errors::InvalidArgument("The dimension of col" + " should be 7, but received %d.", + col.dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); @@ -170,27 +188,33 @@ class Col2VolFunctor { int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d)" + " and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ(input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d)" + " and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ(input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d)" + " and output_width(%d) are mismatching.", + input_width_tmp, output_width)); T* vol_data = vol->data(); const T* col_data = col.data(); diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index 9de9051f512348f2567bfc35ae775b1852ed25fc..fe5a600909893b8313d470923ef4d43eae155e76 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -90,10 +90,16 @@ class Vol2ColFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* col, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol.dims().size(), 4, - "The dimension of vol should be 4."); - PADDLE_ENFORCE_EQ(col->dims().size(), 7, - "The dimension of col should be 7."); + PADDLE_ENFORCE_EQ( + vol.dims().size(), 4, + platform::errors::InvalidArgument("The dimension of" + " vol should be 4, but received %d.", + vol.dims().size())); + PADDLE_ENFORCE_EQ( + col->dims().size(), 7, + platform::errors::InvalidArgument("The dimension of" + "col should be 7, but received %d.", + col->dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); @@ -117,27 +123,33 @@ class Vol2ColFunctor { int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); int num_outputs = input_channels * output_depth * output_height * output_width; @@ -241,10 +253,16 @@ class Col2VolFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* vol, const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol->dims().size(), 4, - "The dimension of vol should be 4."); - PADDLE_ENFORCE_EQ(col.dims().size(), 7, - "The dimension of col should be 7."); + PADDLE_ENFORCE_EQ( + vol->dims().size(), 4, + platform::errors::InvalidArgument("The dimension of vol" + " should be 4, but received %d.", + vol->dims().size())); + PADDLE_ENFORCE_EQ( + col.dims().size(), 7, + platform::errors::InvalidArgument("The dimension of col" + " should be 7, but received %d.", + col.dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); @@ -269,27 +287,33 @@ class Col2VolFunctor { int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d)" + " and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ(input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d)" + " and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ(input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d)" + " and output_width(%d) are mismatching.", + input_width_tmp, output_width)); int num_kernels = input_channels * input_depth * input_height * input_width; diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index 98200caca8cf66960632b88966f23e99fcd4c299..51fa5ad021a2b284cd75f297d83326b2102c1e41 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -30,8 +30,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { float std = context.Attr("std"); auto* tensor = context.Output("Out"); - const std::string op_type = "gaussian_random"; - auto shape = GetShape(context, op_type); + auto shape = GetShape(context); tensor->Resize(shape); T* data = tensor->mutable_data(context.GetPlace()); int64_t size = tensor->numel(); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 29a86a35d7b26f41745907fb6bacf30506c027a0..a6c8f8656a4e252f1a1eedb6d67ca322f0747a66 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -48,6 +48,7 @@ class QuantOpKernel : public framework::OpKernel { const T* input_data = input->data(); bool is_negative = ctx.Attr("is_negative_input"); + bool bfloat16 = ctx.Attr("bfloat16"); std::string key = platform::CreateKey(platform::ThreadIDasStr(), src_tz, scale_data, is_negative, ctx.OutputName("Output")); @@ -74,7 +75,10 @@ class QuantOpKernel : public framework::OpKernel { src_md, engine, to_void_cast(input_data)); std::shared_ptr dst_md; - if (is_negative) { + if (bfloat16) { + platform::SetDstMemoryQuantized( + ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); + } else if (is_negative) { platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); } else { @@ -96,7 +100,11 @@ class QuantOpKernel : public framework::OpKernel 
{ dst_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_dst_mem)); auto place = ctx.GetPlace(); - if (is_negative) { + + if (bfloat16) { + dst_memory->set_data_handle( + output->mutable_data(place)); + } else if (is_negative) { dst_memory->set_data_handle(output->mutable_data(place)); } else { dst_memory->set_data_handle(output->mutable_data(place)); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc old mode 100644 new mode 100755 index 5f0500d2faa77f7c2e901c0d30ab2c42036d2a86..479f9643749d63c673158ad055409a0925f3d576 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -48,6 +48,9 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("lars_weight_decay", "(float, default 0.0005) LARS weight decay") .SetDefault(0.0005); + AddAttr("epsilon", + "(float, default 0.0) epsilon to avoid division by zero.") + .SetDefault(0.0); AddComment(R"DOC( Lars Momentum Optimizer. diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 1dace4ed6ab3e17b348035e34f6d9ea6d31edae9..eb0111ae4de2f066359e26406f6c7ec3eb54d5fc 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -23,14 +23,16 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v, const T* learning_rate, const T mu, const int64_t num, const T lars_coeff, const T lars_weight_decay, const T* p_norm, - const T* g_norm, T* p_out, T* v_out) { + const T* g_norm, T* p_out, T* v_out, + const T epsilon) { T lr = learning_rate[0]; T local_lr = learning_rate[0]; CUDA_KERNEL_LOOP(i, num) { - if (p_norm[0] > 0 && g_norm[0] > 0) { + if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) { local_lr = lr * lars_coeff * p_norm[0] / - (g_norm[0] + lars_weight_decay * p_norm[0]); + (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon); } + T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]); v_out[i] = v_new; p_out[i] = p[i] - v_new; @@ -54,6 +56,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { T mu = static_cast(ctx.Attr("mu")); T lars_coeff = ctx.Attr("lars_coeff"); T lars_weight_decay = ctx.Attr("lars_weight_decay"); + T epsilon = ctx.Attr("epsilon"); auto* p = param->data(); auto* v = velocity->data(); @@ -79,7 +82,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { eg_norm.device(*place) = eigen_g.square().sum().sqrt(); MomentumLarsKernel<<>>( p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, - p_norm_data, g_norm_data, p_out, v_out); + p_norm_data, g_norm_data, p_out, v_out, epsilon); } };
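For reference, the epsilon threaded through the kernel above guards the LARS local learning rate against a zero denominator; the update it implements is local_lr = lr * lars_coeff * ||p|| / (||g|| + lars_weight_decay * ||p|| + epsilon), applied only when the weight decay and both norms are positive. A scalar sketch of one step (an illustrative helper, not the CUDA kernel):

    // One LARS step for a single parameter; p_norm/g_norm are the full-tensor
    // norms computed outside this helper.
    float lars_step(float p, float g, float& v, float lr, float mu,
                    float lars_coeff, float lars_weight_decay, float epsilon,
                    float p_norm, float g_norm) {
      float local_lr = lr;
      if (lars_weight_decay > 0 && p_norm > 0 && g_norm > 0) {
        local_lr = lr * lars_coeff * p_norm /
                   (g_norm + lars_weight_decay * p_norm + epsilon);
      }
      v = v * mu + local_lr * (g + lars_weight_decay * p);  // velocity update
      return p - v;                                         // updated parameter
    }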
framework::OpKernel { ep_norm = p.square().sum().sqrt(); eg_norm = g.square().sum().sqrt(); T local_lr = lr[0]; - if (ep_norm(0) > 0 && eg_norm(0) > 0) { + if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) { local_lr = lr[0] * lars_coeff * ep_norm(0) / - (eg_norm(0) + lars_weight_decay * ep_norm(0)); + (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon); } v_out = v * mu + local_lr * (g + lars_weight_decay * p); p_out = p - v_out; diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc index 59035d5a8ca5d4214f1370e1b14b2be9b234fa6a..cd7a8c6d24eaaca096b630b74dd7cd1bb9d35d09 100644 --- a/paddle/fluid/operators/p_norm_op.cc +++ b/paddle/fluid/operators/p_norm_op.cc @@ -105,6 +105,12 @@ class PnormOp : public framework::OperatorWithKernel { bool asvector = ctx->Attrs().Get("asvector"); if (asvector) { reduce_dims.emplace_back(1); + if (keepdim) { + for (int i = 1; i < x_dim.size(); ++i) { + reduce_dims.emplace_back(1); + } + x_dim = framework::make_ddim(reduce_dims); + } } else { if (axis < 0) axis = x_dim.size() + axis; for (int i = 0; i < x_dim.size(); ++i) { diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc index 8924e21b46f49b0fd0ec72e6acc7463d7d574d6f..602fdc6ff67787ace488379a2730dad4b8ffe1b1 100644 --- a/paddle/fluid/operators/quantize_op.cc +++ b/paddle/fluid/operators/quantize_op.cc @@ -40,6 +40,8 @@ void QuantOpMaker::Make() { AddAttr("output_format", "Convert format to NHWC or NCHW during quantization.") .SetDefault("NHWC"); + AddAttr("bfloat16", "(bool, default false) Convert to bfloat16") + .SetDefault(false); AddComment(R"DOC(This op will quantize data from FP32 to INT8)DOC"); } diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt index 3da481a142aa2282aade661de7679cf4edf597a0..a68666b100cb52c722c4fefc849e94947130010f 100644 --- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt +++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt @@ -1,6 +1,10 @@ include(operators) if(WITH_GPU) - register_operators(DEPS cub) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + register_operators(DEPS cub) + else() + register_operators() + endif() else() register_operators() endif() @@ -24,5 +28,9 @@ if(WITH_GPU) endif() if(WITH_GPU) - nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub) + else() + nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor) + endif() endif() diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc index 322a1637f5deec909db13f1bd0433446cd7606ae..7cd164bfd3a3d77288b59c40f147ae9cdd8215e0 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc @@ -13,18 +13,138 @@ // limitations under the License. 
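// ----------------------------------------------------------------------------
// [Editor's illustration -- not part of the patch.] The file below rewrites
// the logsumexp operator with dedicated kernels. As background, logsumexp is
// usually evaluated through the stable identity
//   logsumexp(x) = max(x) + log(sum(exp(x - max(x)))),
// which avoids overflow in exp(). A minimal standalone sketch of that
// identity; all names and values here are hypothetical:
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

float LogSumExp(const std::vector<float>& x) {
  float m = *std::max_element(x.begin(), x.end());
  float s = 0.f;
  for (float v : x) s += std::exp(v - m);  // every exponent is <= 0 here
  return m + std::log(s);
}

int main() {
  std::vector<float> x = {1000.f, 1000.f};  // naive exp(1000.f) overflows
  std::printf("%f\n", LogSumExp(x));        // prints 1000.693147 (1000 + ln 2)
  return 0;
}
// ----------------------------------------------------------------------------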
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" -#include +#include #include -#include #include namespace paddle { namespace operators { -class LogsumexpOpMaker : public ops::ReduceOpMaker { - protected: - virtual std::string GetName() const { return "logsumexp"; } - virtual std::string GetOpType() const { return "Reduce logsumexp"; } +class LogsumexpOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "logsumexp"); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 4, + platform::errors::InvalidArgument( + "The input tensor X's dimensions of logsumexp " + "should be less equal than 4. But received X's " + "dimensions = %d, X's shape = [%s].", + x_rank, x_dims)); + auto axis = ctx->Attrs().Get>("axis"); + PADDLE_ENFORCE_GT( + axis.size(), 0, + platform::errors::InvalidArgument( + "The size of axis of logsumexp " + "should be greater than 0. But received the size of axis " + "of logsumexp is %d.", + axis.size())); + + for (size_t i = 0; i < axis.size(); i++) { + PADDLE_ENFORCE_LT( + axis[i], x_rank, + platform::errors::InvalidArgument( + "axis[%d] should be in the " + "range [-dimension(X), dimension(X)] " + "where dimesion(X) is %d. But received axis[i] = %d.", + i, x_rank, axis[i])); + PADDLE_ENFORCE_GE( + axis[i], -x_rank, + platform::errors::InvalidArgument( + "axis[%d] should be in the " + "range [-dimension(X), dimension(X)] " + "where dimesion(X) is %d. But received axis[i] = %d.", + i, x_rank, axis[i])); + if (axis[i] < 0) { + axis[i] += x_rank; + } + } + + bool keepdim = ctx->Attrs().Get("keepdim"); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + auto dims_vector = vectorize(x_dims); + if (reduce_all) { + if (keepdim) + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = vectorize(x_dims); + if (keepdim) { + for (size_t i = 0; i < axis.size(); ++i) { + dims_vector[axis[i]] = 1; + } + } else { + const int kDelFlag = -1; + for (size_t i = 0; i < axis.size(); ++i) { + dims_vector[axis[i]] = kDelFlag; + } + dims_vector.erase( + std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + } + if (!keepdim && dims_vector.size() == 0) { + dims_vector.push_back(1); + } + auto out_dims = framework::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (axis.size() > 0 && axis[0] != 0) { + // Only pass LoD when not reducing on the first dim. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } +}; + +class LogsumexpOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 4 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr>( + "axis", + "(list, default {0}) The dimensions to reduce. " + "Must be in the range [-rank(input), rank(input)). " + "If `axis[i] < 0`, the axis[i] to reduce is `rank + axis[i]`. 
" + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault({0}); + AddAttr("keepdim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + AddComment(string::Sprintf(R"DOC( +logsumexp Operator. + +This operator computes the logsumexp of input tensor along the given axis. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. + +)DOC")); + } +}; + +class LogsumexpGrapOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp"); + OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "logsumexp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "logsumexp"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } }; template @@ -32,7 +152,6 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; - protected: void Apply(GradOpPtr op) const override { op->SetType("logsumexp_grad"); op->SetInput("X", this->Input("X")); @@ -46,18 +165,17 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle -REGISTER_OPERATOR(logsumexp, ops::ReduceOp, ops::LogsumexpOpMaker, +namespace ops = paddle::operators; + +REGISTER_OPERATOR(logsumexp, ops::LogsumexpOp, ops::LogsumexpOpMaker, ops::LogsumexpGradOpMaker, ops::LogsumexpGradOpMaker); -REGISTER_OPERATOR(logsumexp_grad, ops::ReduceGradOp); +REGISTER_OPERATOR(logsumexp_grad, ops::LogsumexpGrapOp); -REGISTER_OP_CPU_KERNEL(logsumexp, - ops::ReduceKernel, - ops::ReduceKernel); REGISTER_OP_CPU_KERNEL( - logsumexp_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel); + logsumexp, ops::LogsumexpKernel, + ops::LogsumexpKernel); +REGISTER_OP_CPU_KERNEL( + logsumexp_grad, + ops::LogsumexpGradKernel, + ops::LogsumexpGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu index c9ad1075c0c3c1c6f405144dbfde2e81b85124aa..86a31595ebaabcbc07fab64779c33566d5b020eb 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu @@ -14,8 +14,8 @@ #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" -REGISTER_OP_CUDA_KERNEL(logsumexp, - ops::ReduceKernel, - ops::ReduceKernel); +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + logsumexp, ops::LogsumexpKernel, + ops::LogsumexpKernel); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h index 1d0e00262a37ff7160abd7a865e63377f8b30461..a478690976bd396db921b465d171a422451e0742 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.h +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h @@ -14,11 +14,20 @@ #pragma once -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include +#include +#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" namespace paddle { namespace operators { +#define HANDLE_DIM(NDIM, RDIM) \ + if (ndim == NDIM && rdim == RDIM) { \ + ReduceFunctor( \ + context.template device_context(), 
*input, output, \ + axis, keepdim); \ + } + struct LogsumexpFunctor { template void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { @@ -54,5 +63,106 @@ struct LogsumexpGradFunctor { } }; +template +class LogsumexpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto axis = context.Attr>("axis"); + auto keepdim = context.Attr("keepdim"); + auto reduce_all = context.Attr("reduce_all"); + + const auto& input_dim_size = input->dims().size(); + // The dims has full dim, set the reduce_all is True + reduce_all |= (static_cast(axis.size()) == input_dim_size); + + if (reduce_all) { + // Flatten and reduce 1-D tensor + auto x = EigenVector::Flatten(*input); + auto out = EigenScalar::From(*output); + auto& place = + *context.template device_context().eigen_device(); + auto reduce_dim = Eigen::array({{0}}); + LogsumexpFunctor()(place, &x, &out, reduce_dim); + } else { + int ndim = input_dim_size; + int rdim = axis.size(); + // comments for accelerating compiling temporarily. + // HANDLE_DIM(6, 5); + // HANDLE_DIM(6, 4); + // HANDLE_DIM(6, 3); + // HANDLE_DIM(6, 2); + // HANDLE_DIM(6, 1); + // HANDLE_DIM(5, 4); + // HANDLE_DIM(5, 3); + // HANDLE_DIM(5, 2); + // HANDLE_DIM(5, 1); + HANDLE_DIM(4, 3); + HANDLE_DIM(4, 2); + HANDLE_DIM(4, 1); + HANDLE_DIM(3, 2); + HANDLE_DIM(3, 1); + HANDLE_DIM(2, 1); + } + } +}; + +template +class LogsumexpGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Input("Out"); + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* input_grad = context.Output(framework::GradVarName("X")); + input_grad->mutable_data(context.GetPlace()); + + auto axis = context.Attr>("axis"); + auto reduce_all = context.Attr("reduce_all"); + const auto input_dim_size = context.Input("X")->dims().size(); + reduce_all |= (static_cast(axis.size()) == input_dim_size); + + if (reduce_all) { + auto x = EigenVector::Flatten(*input); + auto y = EigenVector::Flatten(*output); + auto dy = EigenVector::Flatten(*output_grad); + auto dx = EigenVector::Flatten(*input_grad); + auto& place = + *context.template device_context().eigen_device(); + auto broadcast_dim = + Eigen::array({{static_cast(input->numel())}}); + LogsumexpGradFunctor()(place, &x, &y, &dx, &dy, broadcast_dim, + broadcast_dim[0]); + } else { + int rank = input->dims().size(); + switch (rank) { + case 1: + ReduceGradFunctor( + context.template device_context(), *input, *output, + *output_grad, input_grad, axis); + break; + case 2: + ReduceGradFunctor( + context.template device_context(), *input, *output, + *output_grad, input_grad, axis); + break; + case 3: + ReduceGradFunctor( + context.template device_context(), *input, *output, + *output_grad, input_grad, axis); + break; + case 4: + ReduceGradFunctor( + context.template device_context(), *input, *output, + *output_grad, input_grad, axis); + break; + } + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu index d6ad4863092a50233b806c944db0b8c161ed9dd0..81124e4f070a54444f4305dc903280548ac10b60 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu +++ 
b/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu @@ -15,8 +15,9 @@ // .part used to speed up nvcc compile #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" +namespace ops = paddle::operators; + REGISTER_OP_CUDA_KERNEL( - logsumexp_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel); + logsumexp_grad, + ops::LogsumexpGradKernel, + ops::LogsumexpGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 6e470e3af4e5860796d898a5e3138c28958264cb..54818470b277443e411ea5f7d9c7561eddc7046a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -51,6 +51,20 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker { } }; +template +class ReduceSumDoubleOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + op->SetAttrMap(this->Attrs()); + op->SetType("reduce_sum"); + } +}; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ReduceSumGradNoNeedBufferVarInferer, "X"); class ReduceSumVarTypeInference : public paddle::framework::VarTypeInference { public: @@ -77,6 +91,8 @@ REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumOpGradMaker, ops::ReduceSumOpGradMaker); REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, + ops::ReduceSumDoubleOpGradMaker, + ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumGradNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 911dfea50e2e2cf8ec8f230bfc1e0bf4836463b6..0eeb7e0bb24f512aa6859e92de9f490e491543aa 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -11,6 +11,7 @@ limitations under the License. */ #include "paddle/fluid/operators/roi_align_op.h" #include +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -35,13 +36,13 @@ class ROIAlignOp : public framework::OperatorWithKernel { auto input_dims = ctx->GetInputDim("X"); auto rois_dims = ctx->GetInputDim("ROIs"); - if (ctx->HasInput("RoisLod")) { - auto rois_lod_dims = ctx->GetInputDim("RoisLod"); + if (ctx->HasInput("RoisNum")) { + auto rois_num_dims = ctx->GetInputDim("RoisNum"); PADDLE_ENFORCE_EQ( - rois_lod_dims.size(), 1, - platform::errors::InvalidArgument("The RoisLod dimension should be 1" - ", but got dimension = %d", - rois_lod_dims.size())); + rois_num_dims.size(), 1, + platform::errors::InvalidArgument("The size of RoisNum should be 1" + ", but received size = %d", + rois_num_dims.size())); } PADDLE_ENFORCE_EQ( input_dims.size(), 4, @@ -145,9 +146,9 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker { "given as [[x1, y1, x2, y2], ...]. 
" "(x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates."); - AddInput("RoisLod", + AddInput("RoisNum", "(Tensor), " - "The lod info of rois.") + "The number of RoIs in each image.") .AsDispensable(); AddOutput("Out", "(Tensor), " @@ -203,7 +204,7 @@ class ROIAlignGradMaker : public framework::SingleGradOpMaker { op->SetType("roi_align_grad"); op->SetInput("X", this->Input("X")); op->SetInput("ROIs", this->Input("ROIs")); - op->SetInput("RoisLod", this->Input("RoisLod")); + op->SetInput("RoisNum", this->Input("RoisNum")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetAttrMap(this->Attrs()); @@ -231,3 +232,10 @@ REGISTER_OP_CPU_KERNEL( ops::CPUROIAlignGradOpKernel, ops::CPUROIAlignGradOpKernel, ops::CPUROIAlignGradOpKernel); +REGISTER_OP_VERSION(roi_align) + .AddCheckpoint( + R"ROC( + Upgrade roi_align add a new input [RoisNum])ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "RoisNum", + "The number of RoIs in each image. RoisNum is dispensable.")); diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index f7ec13e5bccd63d2f6552ed52f8d709a57320ddd..3a4ce55f4fb77160e7fc645539c1868fe2864b19 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -257,24 +257,26 @@ class GPUROIAlignOpKernel : public framework::OpKernel { int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - if (ctx.HasInput("RoisLod")) { - auto* rois_lod = ctx.Input("RoisLod"); - int rois_batch_size = rois_lod->numel(); + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + int rois_batch_size = rois_num_t->numel(); PADDLE_ENFORCE_EQ( - rois_batch_size - 1, batch_size, + rois_batch_size, batch_size, platform::errors::InvalidArgument( "The rois_batch_size and imgs " "batch_size must be the same. 
But received rois_batch_size = %d, " "batch_size = %d", rois_batch_size, batch_size)); - std::vector rois_lod_(rois_batch_size); - memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data(), - sizeof(int64_t) * rois_batch_size, 0); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) { + std::vector rois_num_list(rois_batch_size); + memory::Copy(cplace, rois_num_list.data(), gplace, + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_list[n]; } } else { auto lod = rois->lod(); @@ -348,16 +350,18 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.cuda_device_context(); auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - if (ctx.HasInput("RoisLod")) { - auto* rois_lod = ctx.Input("RoisLod"); - int rois_batch_size = rois_lod->numel(); - std::vector rois_lod_(rois_batch_size); - memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data(), - sizeof(int64_t) * rois_batch_size, 0); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) { + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + int rois_batch_size = rois_num_t->numel(); + std::vector rois_num_list(rois_batch_size); + memory::Copy(cplace, rois_num_list.data(), gplace, + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = start; i < start + rois_num_list[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_list[n]; } } else { auto rois_lod = rois->lod().back(); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index 366f865411461c91b4ec88203390b15fdba4414c..066125a92fbd9d1d49f0ba023366865620674e1f 100644 --- a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -165,21 +165,23 @@ class CPUROIAlignOpKernel : public framework::OpKernel { int* roi_batch_id_data = roi_batch_id_list.mutable_data(ctx.GetPlace()); int rois_batch_size; - if (ctx.HasInput("RoisLod")) { - auto* rois_lod_t = ctx.Input("RoisLod"); - rois_batch_size = rois_lod_t->numel(); + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); PADDLE_ENFORCE_EQ( - rois_batch_size - 1, batch_size, + rois_batch_size, batch_size, platform::errors::InvalidArgument( "The batch size of rois and the batch size of images " " must be the same. 
But received the batch size of rois is %d, " "and the batch size of images is %d", rois_batch_size, batch_size)); - auto* rois_lod = rois_lod_t->data(); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_data[n]; } } else { auto lod = rois->lod(); @@ -303,14 +305,16 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel { roi_batch_id_list.mutable_data(ctx.GetPlace()); int rois_batch_size; - if (ctx.HasInput("RoisLod")) { - auto* rois_lod_t = ctx.Input("RoisLod"); - rois_batch_size = rois_lod_t->numel(); - auto* rois_lod = rois_lod_t->data(); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_data[n]; } } else { auto rois_lod = rois->lod().back(); diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 8a34cb35f6bf8dde97d29c02749d8a52fdf5f090..be3187b7513144f583458f3d7902a102e531a981 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/roi_pool_op.h" #include +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -34,12 +35,13 @@ class ROIPoolOp : public framework::OperatorWithKernel { auto input_dims = ctx->GetInputDim("X"); auto rois_dims = ctx->GetInputDim("ROIs"); - if (ctx->HasInput("RoisLod")) { - auto rois_lod_dims = ctx->GetInputDim("RoisLod"); - PADDLE_ENFORCE_EQ(rois_lod_dims.size(), 1, + if (ctx->HasInput("RoisNum")) { + auto rois_num_dims = ctx->GetInputDim("RoisNum"); + PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, platform::errors::InvalidArgument( - "The lod information tensor of ROIs should " - "be one-dimensional")); + "The second dimension of RoisNum should " + "be 1, but received dimension is %d", + rois_num_dims.size())); } PADDLE_ENFORCE_EQ(input_dims.size(), 4, platform::errors::InvalidArgument( @@ -140,7 +142,8 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "Where batch_id is the id of the data, " "(x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates."); - AddInput("RoisLod", "(Tensor), The lod info of rois.").AsDispensable(); + AddInput("RoisNum", "(Tensor), The number of RoIs in each image.") + .AsDispensable(); AddOutput("Out", "(Tensor), " "The output of ROIPoolOp is a 4-D tensor with shape " @@ -197,7 +200,7 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker { op->SetType("roi_pool_grad"); op->SetInput("X", this->Input("X")); op->SetInput("ROIs", this->Input("ROIs")); - op->SetInput("RoisLod", this->Input("RoisLod")); + op->SetInput("RoisNum", this->Input("RoisNum")); op->SetInput("Argmax", this->Output("Argmax")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); @@ -223,3 +226,10 @@ REGISTER_OP_CPU_KERNEL( ops::CPUROIPoolGradOpKernel, 
ops::CPUROIPoolGradOpKernel, ops::CPUROIPoolGradOpKernel); +REGISTER_OP_VERSION(roi_pool) + .AddCheckpoint( + R"ROC( + Upgrade roi_pool add a new input [RoisNum])ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "RoisNum", + "The number of RoIs in each image. RoisNum is dispensable.")); diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 1e8a8e3037d84f980d963d359ce791b1ddba47d3..98d9ef6b6e11440d38abbedbfd93f6d3544d77bc 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -157,19 +157,21 @@ class GPUROIPoolOpKernel : public framework::OpKernel { int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - if (ctx.HasInput("RoisLod")) { - auto* rois_lod = ctx.Input("RoisLod"); - int rois_batch_size = rois_lod->numel(); + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + int rois_batch_size = rois_num_t->numel(); PADDLE_ENFORCE_EQ( - rois_batch_size - 1, batch_size, + rois_batch_size, batch_size, "The rois_batch_size and imgs batch_size must be the same."); - std::vector rois_lod_(rois_batch_size); - memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data(), - sizeof(int64_t) * rois_batch_size, 0); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) { + std::vector rois_num_list(rois_batch_size); + memory::Copy(cplace, rois_num_list.data(), gplace, + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_list[n]; } } else { auto rois_lod = rois->lod().back(); @@ -206,7 +208,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); - auto* rois_lod = ctx.Input("RoisLod"); + auto* rois_lod = ctx.Input("RoisNum"); auto* argmax = ctx.Input("Argmax"); auto* out_grad = ctx.Input(framework::GradVarName("Out")); @@ -229,17 +231,18 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.cuda_device_context(); auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - if (ctx.HasInput("RoisLod")) { - auto* rois_lod = ctx.Input("RoisLod"); - int rois_batch_size = rois_lod->numel(); - std::vector rois_lod_(rois_batch_size); - memory::Copy(cplace, rois_lod_.data(), gplace, - rois_lod->data(), - sizeof(int64_t) * rois_batch_size, 0); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) { + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + int rois_batch_size = rois_num_t->numel(); + std::vector rois_num_list(rois_batch_size); + memory::Copy(cplace, rois_num_list.data(), gplace, + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_list[n]; } } else { auto rois_lod = rois->lod().back(); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h index 145b170dedf0613328223526b0a40a3c064f3028..40de6d0cf6abbcc4a1505cb6eb121ca70813c780 100644 --- a/paddle/fluid/operators/roi_pool_op.h 
+++ b/paddle/fluid/operators/roi_pool_op.h @@ -58,18 +58,20 @@ class CPUROIPoolOpKernel : public framework::OpKernel { roi_batch_id_list.mutable_data(ctx.GetPlace()); int rois_batch_size; - if (ctx.HasInput("RoisLod")) { - auto* rois_lod_t = ctx.Input("RoisLod"); - rois_batch_size = rois_lod_t->numel(); + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); PADDLE_ENFORCE_EQ( - rois_batch_size - 1, batch_size, + rois_batch_size, batch_size, platform::errors::InvalidArgument("The rois_batch_size and imgs " "batch_size must be the same.")); - auto* rois_lod = rois_lod_t->data(); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_data[n]; } } else { auto rois_lod = rois->lod().back(); @@ -185,14 +187,16 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { roi_batch_id_list.mutable_data(ctx.GetPlace()); int rois_batch_size; - if (ctx.HasInput("RoisLod")) { - auto* rois_lod_t = ctx.Input("RoisLod"); - rois_batch_size = rois_lod_t->numel(); - auto* rois_lod = rois_lod_t->data(); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_data[n]; } } else { auto rois_lod = rois->lod().back(); diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 04559a93c866c72f2d0b309a5005557134355666..2d599716443901053aa3d5dc8e93759320175b24 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -27,9 +27,6 @@ class RunProgramOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true, platform::errors::NotFound( "Input(X) of RunProgramOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInputs("Params"), true, - platform::errors::NotFound( - "Input(Params) of RunProgramOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutputs("Out"), true, platform::errors::NotFound( "Output(Out) of RunProgramOp should not be null.")); @@ -73,7 +70,8 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker { "(vector)" "The input parameter of RunProgram operator, also the parameters " "of the loaded program.") - .AsDuplicable(); + .AsDuplicable() + .AsDispensable(); AddOutput("Out", "(vector)" "The output tensors of RunProgram operator, also the fetch " @@ -121,10 +119,6 @@ class RunProgramGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true, platform::errors::NotFound( "Input(X) of RunProgramGradOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInputs("Params"), true, - platform::errors::NotFound( - "Input(Params) of RunProgramGradOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInputs(framework::GradVarName("Out")), true, platform::errors::NotFound( diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 
1c493fc6be093a2af8f58c8e78d1be43de34306f..5afe25cf687fc96d1eaac33b2d0516c96c394a46 100644
--- a/paddle/fluid/operators/run_program_op.h
+++ b/paddle/fluid/operators/run_program_op.h
@@ -209,9 +209,14 @@ class RunProgramOpKernel : public framework::OpKernel {
     auto output_vars = ctx.MultiOutputVar("Out");
 
     auto input_var_names = ctx.InputNames("X");
-    auto param_names = ctx.InputNames("Params");
     auto output_var_names = ctx.OutputNames("Out");
 
+    // The current program may not hold parameters.
+    std::vector param_names;
+    if (!param_vars.empty()) {
+      param_names = ctx.InputNames("Params");
+    }
+
     auto *block = ctx.Attr("global_block");
     auto *program = block->Program();
     auto start_op_index = ctx.Attr("start_op_index");
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index 64ee868fb6d8b1cf55f6400a28c10038efc7884e..11c81d23b2ed271ce89e6a27b1179e7d06dd0ebd 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -60,7 +60,10 @@ class ScaleKernel : public framework::OpKernel {
     out->mutable_data(in->place());
 
     PADDLE_ENFORCE_EQ(in->dims(), out->dims(),
-                      "in and out should have the same dim");
+                      paddle::platform::errors::InvalidArgument(
+                          "The input and output should have the same dim, "
+                          "but input dim is %s, output dim is %s.",
+                          in->dims(), out->dims()));
 
     auto eigen_out = framework::EigenVector::Flatten(*out);
     auto eigen_in = framework::EigenVector::Flatten(*in);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h
index 013170199a6bbe0246406b9c35a326bd063875a9..1186ed891e8c080c023aae5076cf1cb086fbc231 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h
@@ -92,9 +92,11 @@ class SequenceExpandKernel : public framework::OpKernel {
     auto& x_lod = x->lod();
     auto& y_lod = y->lod();
 
-    PADDLE_ENFORCE_EQ(y_lod.empty(), false,
-                      "Input(Y) Tensor of SequenceExpandOp does not contain "
-                      "LoD information.");
+    PADDLE_ENFORCE_EQ(
+        y_lod.empty(), false,
+        platform::errors::InvalidArgument(
+            "Input(Y) Tensor of SequenceExpandOp does not contain "
+            "LoD information."));
 
     if (ref_level == -1) ref_level = y_lod.size() - 1;
diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc
index b45fa7c791ff22be422ce12a8348a071c60ddd0f..70733d643673ad8acde9a45f273a52a9723fb0d3 100644
--- a/paddle/fluid/operators/size_op.cc
+++ b/paddle/fluid/operators/size_op.cc
@@ -53,7 +53,7 @@ REGISTER_OPERATOR(
     size, ops::SizeOp, ops::SizeOpMaker,
     paddle::framework::EmptyGradOpMaker,
     paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel, ops::SizeKernel,
+REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel, ops::SizeKernel,
                        ops::SizeKernel, ops::SizeKernel,
                        ops::SizeKernel, ops::SizeKernel);
diff --git a/paddle/fluid/operators/size_op.cu b/paddle/fluid/operators/size_op.cu
index 3ea3032693236d5618ff6f0c858cbd85e34633ab..de56ecd95270577689f699462b9273b43f34595e 100644
--- a/paddle/fluid/operators/size_op.cu
+++ b/paddle/fluid/operators/size_op.cu
@@ -16,7 +16,7 @@ limitations under the License.
 */
 REGISTER_OP_CUDA_KERNEL(
     size, paddle::operators::SizeKernel,
-    paddle::operators::SizeKernel,
+    paddle::operators::SizeKernel,
     paddle::operators::SizeKernel, paddle::operators::SizeKernel,
     paddle::operators::SizeKernel, paddle::operators::SizeKernel);
diff --git a/paddle/fluid/operators/size_op.h b/paddle/fluid/operators/size_op.h
index fb44070897156ef88062231322e28a2db1f244a7..e8c53d6e683305bfc1ff7c052a2dc54ecf465936 100644
--- a/paddle/fluid/operators/size_op.h
+++ b/paddle/fluid/operators/size_op.h
@@ -26,8 +26,18 @@ class SizeKernel : public framework::OpKernel {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in_t = ctx.Input("Input");
     auto* out_t = ctx.Output("Out");
-    auto out_data = out_t->mutable_data(platform::CPUPlace());
-    out_data[0] = in_t->numel();
+    auto place = ctx.GetPlace();
+    auto out_data = out_t->mutable_data(place);
+    auto cpu_place = platform::CPUPlace();
+    if (place == cpu_place) {
+      out_data[0] = in_t->numel();
+    } else {
+      Tensor cpu_tensor;
+      auto cpu_data =
+          cpu_tensor.mutable_data(out_t->dims(), cpu_place);
+      cpu_data[0] = in_t->numel();
+      TensorCopy(cpu_tensor, place, out_t);
+    }
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index b06e8202cc79f017e26e3c8339ad05951a5a2bf7..52c4c63b473c443bb97fb7962179ce27e06fb16c 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -186,10 +186,17 @@ class SumOp : public framework::OperatorWithKernel {
         }
       }
     }
-    PADDLE_THROW("Cannot find the input data type by all input data");
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Expected each tensor in Input(x) in sum op to be initialized, but "
+        "some tensor in Input(x) is not initialized, please check your "
+        "code."));
   }
-  PADDLE_THROW("Unexpected branch. Input type is %s",
-               framework::ToTypeName(x_vars[0]->Type()));
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "Expected type of Input(X) must be Tensor, SelectedRows or "
+      "LoDTensorArray. But got "
+      "unsupported type: %s.",
+      framework::ToTypeName(x_vars[0]->Type())));
 }
};
diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu
index d0bf3a0abf58c47720216bd839eb84260ac207d8..6034cda50c32a857d8a501bf243a91df2f966eea 100644
--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -169,8 +169,18 @@ void SumToLoDTensor(const framework::ExecutionContext &context) {
         auto row_numel = sr_value.numel() / sr_rows.size();
         auto out_dims = out->dims();
 
-        PADDLE_ENFORCE_EQ(sr.height(), out_dims[0]);
-        PADDLE_ENFORCE_EQ(row_numel, out->numel() / sr.height());
+        PADDLE_ENFORCE_EQ(sr.height(), out_dims[0],
+                          platform::errors::InvalidArgument(
+                              "The table height of input must be the same as "
+                              "output, but received input height is %d"
+                              ", output height is %d",
+                              sr.height(), out_dims[0]));
+        PADDLE_ENFORCE_EQ(row_numel, out->numel() / sr.height(),
+                          platform::errors::InvalidArgument(
+                              "The table width of input must be the same as "
+                              "output, but received input width is %d"
+                              ", output width is %d",
+                              row_numel, out->numel() / sr.height()));
 
         auto *sr_data = sr_value.data();
         auto *sr_out_data = out->data();
@@ -231,8 +241,11 @@ class SumKernel
     } else if (out_var->IsType()) {
       LodTensorArrayCompute(context);
     } else {
-      PADDLE_THROW("Unexpected branch, output variable type is %s",
-                   framework::ToTypeName(out_var->Type()));
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected type of Output(out) must be Tensor, SelectedRows or "
+          "LoDTensorArray. But got "
+          "unsupported type: %s.",
+          framework::ToTypeName(out_var->Type())));
     }
   }
 };
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index 6847a81377979ab05aec03f43ba08fbec646d974..4c8f7be6ea26394bd3143058260c1fc94ce1e7e1 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -182,7 +182,11 @@ class SumKernel : public framework::OpKernel {
           auto &in_t = in_vars[i]->Get();
           functor(context.template device_context(), in_t, out);
         } else {
-          PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "Expected type of the %d-th Input(X) must be Tensor or "
+              "SelectedRows. But got "
+              "unsupported type: %s.",
+              framework::ToTypeName(in_vars[i]->Type())));
         }
       }
     } else if (out_var->IsType()) {
@@ -190,8 +194,11 @@ class SumKernel : public framework::OpKernel {
     } else if (out_var->IsType()) {
       LodTensorArrayCompute(context);
     } else {
-      PADDLE_THROW("Unexpected branch, output variable type is %s",
-                   framework::ToTypeName(out_var->Type()));
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Expected type of Output(out) must be Tensor, SelectedRows or "
+          "LoDTensorArray. But got "
+          "unsupported type: %s.",
+          framework::ToTypeName(out_var->Type())));
     }
   }
 };
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index cc6ee7b19ea99fe61ef00beaf475fc35c8a0a809..9cfe47da5db7ba15c9b24a8d551606f805ad9b15 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -208,8 +208,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
     auto stream =
         reinterpret_cast(dev_ctx).stream();
-    PADDLE_ENFORCE_EQ(input_names_.empty(), false,
-                      "should pass at least one input");
+    PADDLE_ENFORCE_EQ(
+        input_names_.empty(), false,
+        platform::errors::PreconditionNotMet(
+            "TensorRT engine needs at least one input, but no input is found. 
" + "Please check if you set the input correctly.")); std::vector output_maps = Attr>("output_name_mapping"); @@ -295,12 +298,19 @@ class TensorRTEngineOp : public framework::OperatorBase { #endif } auto *fluid_v = scope.FindVar(y); - PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); + PADDLE_ENFORCE_NOT_NULL( + fluid_v, + platform::errors::NotFound( + "Output variable %s is not found in TensorRT subgraph.", y)); auto *fluid_t = fluid_v->GetMutable(); fluid_t->Resize(framework::make_ddim(ddim)); - PADDLE_ENFORCE(bind_index < num_bindings, - "The bind index should be less than num_bindings"); + PADDLE_ENFORCE_LT(bind_index, num_bindings, + platform::errors::InvalidArgument( + "The binding index in TRT engine should be less " + "than the number of bindings, but got binding " + "index = %d, number of bindings = %d.", + bind_index, num_bindings)); buffers[bind_index] = static_cast(fluid_t->mutable_data( BOOST_GET_CONST(platform::CUDAPlace, dev_place))); diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index da4ca87296d92fc1052f462ae6ee8a3acb05eb49..bc1cb3b4aa1c1bdd0a9be39a4e113301d65ce5b5 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -241,6 +241,26 @@ class TileGradOpMaker : public framework::SingleGradOpMaker { } }; +template +class TileDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("tile"); + op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + if (this->HasInput("repeat_times_tensor")) { + op->SetInput("repeat_times_tensor", this->Input("repeat_times_tensor")); + } + if (this->HasInput("RepeatTimes")) { + op->SetInput("RepeatTimes", this->Input("RepeatTimes")); + } + op->SetAttrMap(this->Attrs()); + } +}; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X"); } // namespace operators @@ -251,6 +271,8 @@ REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker, ops::TileGradOpMaker, ops::TileGradOpMaker); REGISTER_OPERATOR(tile_grad, ops::TileGradOp, + ops::TileDoubleGradOpMaker, + ops::TileDoubleGradOpMaker, ops::TileGradNoNeedBufVarsInferer); REGISTER_OP_CPU_KERNEL( tile, ops::TileKernel, diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 9cffe09a33abf29308072d6b3c8bfb8a636048da..6efada4343ca54c0d56f98cae20963bf0182f47b 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -54,9 +54,11 @@ class CPUUniformRandomKernel : public framework::OpKernel { tensor = out_var->GetMutable(); if (!new_shape.empty()) tensor->Resize(framework::make_ddim(new_shape)); } else { - PADDLE_THROW( - "uniform_random_op's output only" - "supports SelectedRows and LoDTensor"); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type of Output(out) in uniform_random_op must be Tensor, " + "SelectedRows. 
But got " + "unsupport type: %s.", + framework::ToTypeName(out_var->Type()))); } T *data = tensor->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 6237137cccbc6840b345c9e26dda1ccdc8df43b0..563a6c165b748543516eabbcdb0e1c8b9be8a44d 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -116,9 +116,11 @@ class GPUUniformRandomKernel : public framework::OpKernel { tensor = out_var->GetMutable(); if (!new_shape.empty()) tensor->Resize(framework::make_ddim(new_shape)); } else { - PADDLE_THROW( - "uniform_random_op's output only" - "supports SelectedRows and LoDTensor"); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type of Output(out) in uniform_random_op must be Tensor, " + "SelectedRows. But got " + "unsupport type: %s.", + framework::ToTypeName(out_var->Type()))); } T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index d263dd03dd0de0d1b12925d0c3ec428b6730ef2e..6052e533643f3c4e5be977a87fceafa932892862 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -50,7 +50,10 @@ inline std::vector GetNewDataFromShapeTensor( } return vec_new_data; } else { - PADDLE_THROW("The dtype of shape tensor must be int32 or int64."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected dtype of ShapeTensor must be int32, int64. But got " + "unsupport dtype: %s.", + paddle::framework::DataTypeToString(new_data_tensor->type()))); } } @@ -84,7 +87,11 @@ inline std::vector GetNewDataFromShapeTensorList( vec_new_shape.push_back(*tensor->data()); } } else { - PADDLE_THROW("The dtype of shape tensor must be int32 or int64."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected dtype of ShapeTensorList of %d-th must be int32, int64. " + "But got " + "unsupport dtype: %s.", + i, paddle::framework::DataTypeToString(tensor->type()))); } } diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index e53981a53653a4830a39ceae47f4024bb757b039..aec995304a77118ecbf788ca3984c7e9da531f18 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -81,5 +81,26 @@ inline std::vector GetDataFromTensorList( } return vec_new_data; } + +inline framework::DDim GetShape(const framework::ExecutionContext& ctx) { + // 1. shape is a Tensor + if (ctx.HasInput("ShapeTensor")) { + auto* shape_tensor = ctx.Input("ShapeTensor"); + auto vec_shape = GetDataFromTensor(shape_tensor); + return framework::make_ddim(vec_shape); + } + + // 2. shape is a list/tuple containing Tensor + auto shape_tensor_list = ctx.MultiInput("ShapeTensorList"); + if (shape_tensor_list.size() > 0) { + auto vec_shape = GetDataFromTensorList(shape_tensor_list); + return framework::make_ddim(vec_shape); + } + + // 3. 
shape is a list/tuple without containing Tensor + auto vec_shape = ctx.Attr>("shape"); + return framework::make_ddim(vec_shape); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index bbe847e7190d6f9812dcc814d4b4fe74a0cc7ef6..bb4c2a89f6fa5e531aa322b69218cf58d3e94285 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -287,6 +287,8 @@ class ScopedTensorDescriptor { return descriptor(CudnnDataType::type, dim, stride); } + inline cudnnTensorDescriptor_t desc() { return desc_; } + private: cudnnTensorDescriptor_t desc_; DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); @@ -329,6 +331,8 @@ class ScopedRNNTensorDescriptor { input_size, time_major, seq_length); } + inline cudnnRNNDataDescriptor_t desc() { return desc_; } + private: cudnnRNNDataDescriptor_t desc_; DISABLE_COPY_AND_ASSIGN(ScopedRNNTensorDescriptor); @@ -361,6 +365,7 @@ class ScopedDropoutDescriptor { } return desc_; } + inline cudnnDropoutDescriptor_t desc() { return desc_; } private: cudnnDropoutDescriptor_t desc_; @@ -376,7 +381,7 @@ class ScopedRNNDescriptor { PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDescriptor(desc_)); } - inline cudnnRNNDescriptor_t descriptor() { return desc_; } + inline cudnnRNNDescriptor_t desc() { return desc_; } private: cudnnRNNDescriptor_t desc_; @@ -419,172 +424,13 @@ class ScopedFilterDescriptor { kernel, groups); } + inline cudnnFilterDescriptor_t desc() { return desc_; } + private: cudnnFilterDescriptor_t desc_; DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); }; -class ScopedRNNBase { - public: - ScopedRNNBase(int seq_length, int batch_size, int input_size, int hidden_size, - int num_layers, float dropout_prob, int seed, int weight_numel, - bool initialized, bool is_bidirec) - : seq_length_(seq_length), - batch_size_(batch_size), - input_size_(input_size), - hidden_size_(hidden_size), - num_layers_(num_layers), - dropout_prob_(dropout_prob), - seed_(seed), - weight_numel_(weight_numel), - initialized_(initialized), - is_bidirec_(is_bidirec) {} - - template - void Create(const cudnnHandle_t& handle, const platform::Place& place, - std::vector sequence_length, size_t* workspace_size, - size_t* reserve_size, framework::Tensor* dropout_state) { - int numDirections = is_bidirec_ ? 
2 : 1; - cudnnDataType_t cudnn_type = platform::CudnnDataType::type; - - // ------------------- cudnn x, y descriptors --------------------- - std::vector dims_x = {batch_size_, input_size_, 1}; - std::vector strides_x = {input_size_, 1, 1}; - - std::vector dims_y = {batch_size_, hidden_size_ * numDirections, 1}; - std::vector strides_y = {hidden_size_ * numDirections, 1, 1}; - - for (int i = 0; i < seq_length_; ++i) { - x_desc_.emplace_back(x_d.descriptor(dims_x, strides_x)); - y_desc_.emplace_back(y_d.descriptor(dims_y, strides_y)); - } - - if (!sequence_length.empty()) { - x_seq_desc_ = x_seq_d.descriptor(seq_length_, batch_size_, input_size_, - true, sequence_length); - y_seq_desc_ = y_seq_d.descriptor(seq_length_, batch_size_, - hidden_size_ * numDirections, true, - sequence_length); - } - - // ------------------- cudnn hx, hy, cx, cy descriptors---------- - std::vector dims_hx = {num_layers_ * numDirections, batch_size_, - hidden_size_}; - std::vector strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1}; - - hx_desc_ = hx_d.descriptor(dims_hx, strides_hx); - cx_desc_ = cx_d.descriptor(dims_hx, strides_hx); - hy_desc_ = hy_d.descriptor(dims_hx, strides_hx); - cy_desc_ = cy_d.descriptor(dims_hx, strides_hx); - - // ------------------- cudnn dropout descriptors --------------------- - size_t state_size; - if (!initialized_) { - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnDropoutGetStatesSize(handle, &state_size)); - dropout_state->mutable_data({static_cast(state_size)}, - place); - } - dropout_desc_ = - dropout_d.descriptor(handle, place, initialized_, dropout_prob_, - dropout_state, seed_, state_size); - - // ------------------- cudnn rnn descriptors --------------------- - rnn_desc_ = rnn_d.descriptor(); - -#if CUDNN_VERSION >= 6000 - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( - handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - cudnn_type)); -#endif - if (!sequence_length.empty()) { - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( - rnn_desc_, CUDNN_RNN_PADDED_IO_ENABLED)); - } - // ------------------- cudnn weights_size --------------------- - size_t weights_size_; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( - handle, rnn_desc_, x_desc_[0], &weights_size_, cudnn_type)); - - PADDLE_ENFORCE_EQ( - weights_size_, sizeof(T) * weight_numel_, - platform::errors::InvalidArgument( - "The cudnn lstm and setting weight size should be same.")); - - // ------------------- cudnn weight descriptors --------------------- - platform::DataLayout layout = platform::DataLayout::kNCHW; - int dim_tmp = weights_size_ / sizeof(T); - std::vector dim_w = {dim_tmp, 1, 1}; - w_desc_ = w_d.descriptor(layout, dim_w); - - // ------------------- cudnn workspace, reserve size --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( - handle, rnn_desc_, seq_length_, x_desc_.data(), workspace_size)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnGetRNNTrainingReserveSize( - handle, rnn_desc_, seq_length_, x_desc_.data(), reserve_size)); - } - - cudnnTensorDescriptor_t* x_desc() { return x_desc_.data(); } - cudnnTensorDescriptor_t* y_desc() { return y_desc_.data(); } - cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_; } - cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_; } - cudnnTensorDescriptor_t hx_desc() { return hx_desc_; } - cudnnTensorDescriptor_t cx_desc() { return cx_desc_; } - cudnnTensorDescriptor_t hy_desc() { return hy_desc_; } - cudnnTensorDescriptor_t cy_desc() { return cy_desc_; } - cudnnRNNDescriptor_t rnn_desc() { return rnn_desc_; } - cudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_; } - cudnnFilterDescriptor_t w_desc() { return w_desc_; } - - private: - int seq_length_; - int batch_size_; - int input_size_; - int hidden_size_; - int num_layers_; - float dropout_prob_; - int seed_; - int weight_numel_; - bool initialized_; - bool is_bidirec_; - - std::vector x_desc_; - std::vector y_desc_; - cudnnRNNDataDescriptor_t x_seq_desc_; - cudnnRNNDataDescriptor_t y_seq_desc_; - // A tensor descriptor describing the initial hidden state of the RNN. - cudnnTensorDescriptor_t hx_desc_; - // A tensor descriptor describing the initial cell state for LSTM networks. - cudnnTensorDescriptor_t cx_desc_; - // A tensor descriptor describing the final hidden state of the RNN. - cudnnTensorDescriptor_t hy_desc_; - // A tensor descriptor describing the final cell state for LSTM networks. 
- cudnnTensorDescriptor_t cy_desc_; - cudnnDropoutDescriptor_t dropout_desc_; - cudnnFilterDescriptor_t w_desc_; - cudnnRNNDescriptor_t rnn_desc_; - - ScopedTensorDescriptor x_d; - ScopedTensorDescriptor y_d; - ScopedRNNTensorDescriptor x_seq_d; - ScopedRNNTensorDescriptor y_seq_d; - ScopedTensorDescriptor hx_d; - ScopedTensorDescriptor cx_d; - ScopedTensorDescriptor hy_d; - ScopedTensorDescriptor cy_d; - ScopedDropoutDescriptor dropout_d; - ScopedFilterDescriptor w_d; - ScopedRNNDescriptor rnn_d; -}; - class ScopedConvolutionDescriptor { public: ScopedConvolutionDescriptor() { diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 8fb66c6f34bd8453f1aceb731bb1cd94b8e75a69..b012a103ea3031efb381d7039b15e82b2af52bf7 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -443,6 +443,13 @@ inline bool HasOpINT8DataType(const paddle::framework::OpDesc* op) { op->GetAttrIfExists("use_quantizer")); } +inline bool HasOpBFLOAT16DataType(const paddle::framework::OpDesc* op) { + return op->GetAttrIfExists("mkldnn_data_type") == "bfloat16"; +} + +inline bool HasOpFLOAT32DataType(const paddle::framework::OpDesc* op) { + return op->GetAttrIfExists("mkldnn_data_type") == "float32"; +} enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP }; } // namespace platform diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d733cf26ed209bcb86eaf2d366e45cfa0e7f9a90..92d9473141009216e3c7e64ccb793884dc67aadc 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -38,6 +38,7 @@ set(PYBIND_SRCS imperative.cc ir.cc inference_api.cc + compatible.cc generator_py.cc) if(WITH_GLOO) diff --git a/paddle/fluid/pybind/compatible.cc b/paddle/fluid/pybind/compatible.cc new file mode 100644 index 0000000000000000000000000000000000000000..971d230458db4bc2196ca529e01b0586da79567c --- /dev/null +++ b/paddle/fluid/pybind/compatible.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pybind/compatible.h" + +#include +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace py = pybind11; + +using paddle::framework::compatible::PassVersionCheckerRegistrar; + +namespace paddle { +namespace pybind { + +void BindCompatible(py::module* m) { + py::class_(*m, "PassVersionChecker") + .def_static("IsCompatible", [](const std::string& name) -> bool { + auto instance = PassVersionCheckerRegistrar::GetInstance(); + return instance.IsPassCompatible(name); + }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/compatible.h b/paddle/fluid/pybind/compatible.h new file mode 100644 index 0000000000000000000000000000000000000000..f9d4cf5888fee8f62ce2e64636da6b98542b1a75 --- /dev/null +++ b/paddle/fluid/pybind/compatible.h @@ -0,0 +1,23 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <pybind11/pybind11.h> + +namespace paddle { +namespace pybind { +void BindCompatible(pybind11::module *m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 040dd313f1c538b5792538f9da04635ff805b9a8..be4d90597e1e1c647ac6750ee7cebdc2ede8a551 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -60,6 +60,9 @@ void BindAnalysisConfig(py::module *m); void BindAnalysisPredictor(py::module *m); void BindZeroCopyTensor(py::module *m); void BindPaddlePassBuilder(py::module *m); +void BindPaddleInferPredictor(py::module *m); +void BindPaddleInferTensor(py::module *m); +void BindPredictorPool(py::module *m); #ifdef PADDLE_WITH_MKLDNN void BindMkldnnQuantizerConfig(py::module *m); @@ -139,6 +142,15 @@ void ZeroCopyTensorCreate(ZeroCopyTensor &tensor, // NOLINT tensor.copy_from_cpu(static_cast<const T *>(data.data())); } +template <typename T> +void PaddleInferTensorCreate(paddle_infer::Tensor &tensor, // NOLINT + py::array_t<T> data) { + std::vector<int> shape; + std::copy_n(data.shape(), data.ndim(), std::back_inserter(shape)); + tensor.Reshape(std::move(shape)); + tensor.CopyFromCpu(static_cast<const T *>(data.data())); +} + size_t PaddleGetDTypeSize(PaddleDType dt) { size_t size{0}; switch (dt) { @@ -183,6 +195,30 @@ py::array ZeroCopyTensorToNumpy(ZeroCopyTensor &tensor) { // NOLINT return array; } +py::array PaddleInferTensorToNumpy(paddle_infer::Tensor &tensor) { // NOLINT + py::dtype dt = PaddleDTypeToNumpyDType(tensor.type()); + auto tensor_shape = tensor.shape(); + py::array::ShapeContainer shape(tensor_shape.begin(), tensor_shape.end()); + py::array array(dt, std::move(shape)); + + switch (tensor.type()) { + case PaddleDType::INT32: + tensor.CopyToCpu(static_cast<int32_t *>(array.mutable_data())); + break; + case PaddleDType::INT64: + tensor.CopyToCpu(static_cast<int64_t *>(array.mutable_data())); + break; + case PaddleDType::FLOAT32: + tensor.CopyToCpu(static_cast<float *>(array.mutable_data())); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type. Now only supports INT32, INT64 and " + "FLOAT32.")); + } + return array; +} + py::bytes SerializePDTensorToBytes(PaddleTensor &tensor) { // NOLINT std::stringstream ss; paddle::inference::SerializePDTensorToStream(&ss, tensor); @@ -200,8 +236,11 @@ void BindInferenceApi(py::module *m) { BindNativePredictor(m); BindAnalysisConfig(m); BindAnalysisPredictor(m); + BindPaddleInferPredictor(m); BindZeroCopyTensor(m); + BindPaddleInferTensor(m); BindPaddlePassBuilder(m); + BindPredictorPool(m); #ifdef PADDLE_WITH_MKLDNN BindMkldnnQuantizerConfig(m); #endif @@ -209,8 +248,17 @@ void BindInferenceApi(py::module *m) { &paddle::CreatePaddlePredictor<AnalysisConfig>, py::arg("config")); m->def("create_paddle_predictor", &paddle::CreatePaddlePredictor<NativeConfig>, py::arg("config")); + m->def("create_predictor", [](const paddle_infer::Config &config) + -> std::unique_ptr<paddle_infer::Predictor> { + auto pred = + std::unique_ptr<paddle_infer::Predictor>( + new paddle_infer::Predictor(config)); + return std::move(pred); + }); m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes); + m->def("get_version", &paddle_infer::GetVersion); + m->def("get_num_bytes_of_data_type", &paddle_infer::GetNumBytesOfDataType); } namespace { @@ -525,6 +573,19 @@ void BindAnalysisPredictor(py::module *m) { py::arg("dir")); } +void BindPaddleInferPredictor(py::module *m) { + py::class_<paddle_infer::Predictor>(*m, "PaddleInferPredictor") + .def(py::init<const paddle_infer::Config &>()) + .def("get_input_names", &paddle_infer::Predictor::GetInputNames) + .def("get_output_names", &paddle_infer::Predictor::GetOutputNames) + .def("get_input_handle", &paddle_infer::Predictor::GetInputHandle) + .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle) + .def("run", &paddle_infer::Predictor::Run) + .def("clone", &paddle_infer::Predictor::Clone) + .def("clear_intermediate_tensor", + &paddle_infer::Predictor::ClearIntermediateTensor); +} + void BindZeroCopyTensor(py::module *m) { py::class_<ZeroCopyTensor>(*m, "ZeroCopyTensor") .def("reshape", &ZeroCopyTensor::Reshape) @@ -538,6 +599,26 @@ void BindZeroCopyTensor(py::module *m) { .def("type", &ZeroCopyTensor::type); } +void BindPaddleInferTensor(py::module *m) { + py::class_<paddle_infer::Tensor>(*m, "PaddleInferTensor") + .def("reshape", &paddle_infer::Tensor::Reshape) + .def("copy_from_cpu", &PaddleInferTensorCreate<int32_t>) + .def("copy_from_cpu", &PaddleInferTensorCreate<int64_t>) + .def("copy_from_cpu", &PaddleInferTensorCreate<float>) + .def("copy_to_cpu", &PaddleInferTensorToNumpy) + .def("shape", &paddle_infer::Tensor::shape) + .def("set_lod", &paddle_infer::Tensor::SetLoD) + .def("lod", &paddle_infer::Tensor::lod) + .def("type", &paddle_infer::Tensor::type); +} + +void BindPredictorPool(py::module *m) { + py::class_<paddle_infer::services::PredictorPool>(*m, "PredictorPool") + .def(py::init<const paddle_infer::Config &, size_t>()) + .def("retrive", &paddle_infer::services::PredictorPool::Retrive, + py::return_value_policy::reference); +} + void BindPaddlePassBuilder(py::module *m) { py::class_<PaddlePassBuilder>(*m, "PaddlePassBuilder") .def(py::init<const std::vector<std::string> &>()) diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 256faf04ea6de5835f22113537caac49ca1dbab4..f751136640caad6acd3230bc22cd0e3f0fafe9fb 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -43,6 +43,11 @@ std::map<std::string, std::set<std::string>> op_ins_map = { {"nll_loss", {"X", "Label", "Weight"}}, {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}}, {"gather", {"X", "Index", "Axis"}}, + {"roi_pool", {"X", "ROIs", "RoisNum"}}, + {"roi_align", {"X", "ROIs", "RoisNum"}}, + {"collect_fpn_proposals", + {"MultiLevelRois", "MultiLevelScores",
"MultiLevelRoIsNum"}}, + {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -63,6 +68,10 @@ std::map> op_outs_map = { {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", "ReserveSpace"}}, {"unique", {"Out", "Index", "Indices", "Counts"}}, + {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, + {"collect_fpn_proposals", {"FpnRois", "RoisNum"}}, + {"distribute_fpn_proposals", + {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -102,7 +111,9 @@ std::map> op_passing_outs_map = { {"fake_quantize_dequantize_moving_average_abs_max", {"Out", "OutScale", "OutAccum", "OutState"}}, {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}}, - {"amp_check_finite_and_scale", {"Out", "FoundInfinite"}}, + {"check_finite_and_unscale", {"Out", "FoundInfinite"}}, + {"update_loss_scaling", + {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, }; // clang-format off diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 9950eb9adc241ca5c82b4b0289dd57da4195e558..97056eca411f29e9a2c379cbcb2f88775242f692 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -184,6 +184,7 @@ void BindVarDsec(pybind11::module *m) { .value("FP16", pd::proto::VarType::FP16) .value("FP32", pd::proto::VarType::FP32) .value("FP64", pd::proto::VarType::FP64) + .value("BF16", pd::proto::VarType::BF16) .value("LOD_TENSOR", pd::proto::VarType::LOD_TENSOR) .value("SELECTED_ROWS", pd::proto::VarType::SELECTED_ROWS) .value("FEED_MINIBATCH", pd::proto::VarType::FEED_MINIBATCH) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 4b8f7c853ceaf2148722a9c65f38e0ec3d9f4df5..330254ecaafd29c00e8942765956ea065d2bb7cf 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -60,6 +60,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/compatible.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/data_set_py.h" #include "paddle/fluid/pybind/exception.h" @@ -2619,6 +2620,7 @@ All parameter, weight, gradient are variables in Paddle. 
BindGraph(&m); BindNode(&m); BindInferenceApi(&m); + BindCompatible(&m); BindDataset(&m); BindGenerator(&m); #ifdef PADDLE_WITH_CRYPTO diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index 7eab677fac1683fdc95c9e338b1099d78b5cabc3..235d92ac4f9e88947cea04425b0916b8a0290979 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -27,8 +27,6 @@ function(train_test TARGET_NAME) endif() set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES LABELS "RUN_TYPE=DIST") if(NOT WIN32 AND NOT APPLE) set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES TIMEOUT 150) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 27edf1f677aec452995d81ebcf8e8533b5343ce9..9e150763dbb30ec6196ce2e62d28f737f42185fb 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -51,6 +51,17 @@ if %ERRORLEVEL% NEQ 0 ( exit /b 7 ) +rem ------pre install clcache and init config---------- +pip install clcache +:: set USE_CLCACHE to enable clcache +set USE_CLCACHE=1 +:: In some scenarios, CLCACHE_HARDLINK can save one file copy. +set CLCACHE_HARDLINK=1 +:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported +set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 +:: set maximum cache size to 20G +clcache.exe -M 21474836480 + rem ------initialize common variable------ if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" if not defined BRANCH set BRANCH=develop @@ -111,6 +122,8 @@ goto:success :CASE_wincheck_openblas set WITH_MKL=OFF set WITH_GPU=ON +rem Temporarily turn off WITH_INFERENCE_API_TEST on GPU due to compile hang +set WITH_INFERENCE_API_TEST=OFF call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error @@ -152,6 +165,7 @@ echo Step 2. Buile Paddle ... echo ======================================== call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 +for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*8/10 set build_times=1 :build_tp echo Build third_party the %build_times% time: @@ -170,7 +184,7 @@ echo Build third_party successfully! 
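The PARALLEL_PROJECT_COUNT computed above caps msbuild at roughly 80% of the logical processors; `set /a PARALLEL_PROJECT_COUNT=%%#*8/10` is integer arithmetic. A hedged Python sketch of the same calculation:

    import os

    # Mirror of the batch expression `NumberOfLogicalProcessors * 8 / 10`
    # with integer division, e.g. 12 logical CPUs -> 12 * 8 // 10 = 9.
    logical_cpus = os.cpu_count() or 1  # stand-in for the wmic query
    parallel_project_count = logical_cpus * 8 // 10
    print(parallel_project_count)
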
set build_times=1 :build_paddle echo Build Paddle the %build_times% time: -msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln +msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 if %build_times% GTR 2 ( @@ -242,7 +256,7 @@ dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% -ctest.exe --output-on-failure -C Release -j 8 +ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4 goto:eof :unit_test_error diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9c1c95f37ed0785bfd770e7cbc02002daba8447b..ec07565c5af6c7ba79c15d9a335313775719c682 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -296,13 +296,13 @@ function check_style() { commit_files=on for file_name in `git diff --numstat upstream/$BRANCH |awk '{print $NF}'`;do if ! pre-commit run --files $file_name ; then - git diff commit_files=off fi done if [ $commit_files == 'off' ];then echo "code format error" + git diff 2>&1 exit 4 fi trap : 0 @@ -528,9 +528,50 @@ EOF elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi + tmpfile_rand=`date +%s%N` + tmpfile=$tmp_dir/$tmpfile_rand set +e ut_startTime_s=`date +%s` - ctest --output-on-failure -j $2;mactest_error=$? + ctest --output-on-failure -j $2 | tee $tmpfile + failed_test_lists='' + collect_failed_tests + set +x + mactest_error=0 + retry_unittests_record='' + retry_time=3 + exec_times=0 + exec_time_array=('first' 'second' 'third') + if [ -n "$failed_test_lists" ];then + mactest_error=1 + while ( [ $exec_times -lt $retry_time ] && [ -n "${failed_test_lists}" ] ) + do + retry_unittests_record="$retry_unittests_record$failed_test_lists" + failed_test_lists_ult=`echo "${failed_test_lists}"` + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(" | sed 's/(//' | sed 's/- //' ) + echo "=========================================" + echo "This is the ${exec_time_array[$exec_times]} time to re-run" + echo "=========================================" + echo "The following unittest will be re-run:" + echo "${retry_unittests}" + echo "=========================================" + + retry_unittests_regular='' + for line in ${retry_unittests[@]} ; + do + if [[ "$retry_unittests_regular" == "" ]];then + retry_unittests_regular="^$line$" + else + retry_unittests_regular="$retry_unittests_regular|^$line$" + fi + done + rm -f $tmp_dir/* + failed_test_lists='' + ctest -R "($retry_unittests_regular)" --output-on-failure -j $2 | tee $tmpfile + collect_failed_tests + exec_times=$[$exec_times+1] + done + fi + #mactest_error=$? 
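The Mac retry loop above extracts failed test names from the ctest output and joins them into an anchored alternation for ctest -R. A minimal Python sketch of that regex construction (the test names are illustrative):

    # Each failed test name becomes ^name$; names are joined with '|',
    # producing the shell variable retry_unittests_regular.
    failed = ["test_conv2d_op", "test_matmul_op"]  # illustrative names
    pattern = "|".join("^%s$" % name for name in failed)
    assert pattern == "^test_conv2d_op$|^test_matmul_op$"
    # The script then re-runs: ctest -R "(<pattern>)" --output-on-failure -j N
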
ut_endTime_s=`date +%s` echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" paddle version @@ -538,7 +579,21 @@ EOF export http_proxy=$my_proxy export https_proxy=$my_proxy if [ "$mactest_error" != 0 ];then - exit 8; + if [[ "$failed_test_lists" == "" ]]; then + echo "========================================" + echo "There are failed tests, which have been successful after re-run:" + echo "========================================" + echo "The following tests have been re-ran:" + echo "${retry_unittests_record}" + else + failed_test_lists_ult=`echo "${failed_test_lists}"` + echo "========================================" + echo "Summary Failed Tests... " + echo "========================================" + echo "The following tests FAILED: " + echo "${failed_test_lists_ult}" + exit 8; + fi fi fi } @@ -562,6 +617,7 @@ function fetch_upstream_develop_if_not_exist() { function generate_upstream_develop_api_spec() { fetch_upstream_develop_if_not_exist cur_branch=`git branch | grep \* | cut -d ' ' -f2` + git checkout . git checkout -b develop_base_pr upstream/$BRANCH cmake_gen $1 build $2 @@ -1391,7 +1447,7 @@ function example() { cd ${PADDLE_ROOT}/tools python sampcd_processor.py cpu;example_error=$? if [ "$example_error" != "0" ];then - echo "Code instance execution failed" + echo "Code instance execution failed" >&2 exit 5 fi } @@ -1400,15 +1456,25 @@ function summary_check_problems() { set +x local check_style_code=$1 local example_code=$2 + local check_style_info=$3 + local example_info=$4 if [ $check_style_code -ne 0 -o $example_code -ne 0 ];then echo "========================================" echo "summary problems:" + if [ $check_style_code -ne 0 -a $example_code -ne 0 ];then + echo "There are 2 errors: Code format error and Example code error." + else + [ $check_style_code -ne 0 ] && echo "There is 1 error: Code format error." + [ $example_code -ne 0 ] && echo "There is 1 error: Example code error." + fi echo "========================================" if [ $check_style_code -ne 0 ];then - echo "- Check code style failed! Please check the log and fix problems." + echo "*****Code format error***** Please fix it according to the diff information:" + echo "$check_style_info" | grep "code format error" -A $(echo "$check_style_info" | wc -l) fi if [ $example_code -ne 0 ];then - echo "- Check example code failed! Please check the log and fix problems." + echo "*****Example code error***** Please fix the error listed in the information:" + echo "$example_info" | grep "API check -- Example Code" -A $(echo "$example_info" | wc -l) fi [ $check_style_code -ne 0 ] && exit $check_style_code [ $example_code -ne 0 ] && exit $example_code @@ -1430,15 +1496,16 @@ function main() { ;; build_and_check) set +e - $(check_style >&2) + check_style_info=$(check_style) check_style_code=$? generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" - $(example >&2) + set +e + example_info=$(example) example_code=$? 
- summary_check_problems $check_style_code $example_code + summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info" assert_api_spec_approvals ;; build) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index d5793eb424ab794e3e8af8ef2312aac927c272e5..016726633ea355ed20149e94833ca7e1657c3f7d 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -49,6 +49,7 @@ import paddle.optimizer import paddle.metric import paddle.device import paddle.incubate.complex as complex +import paddle.regularizer # TODO: define alias in tensor and framework directory @@ -75,6 +76,7 @@ from .tensor.creation import full_like #DEFINE_ALIAS from .tensor.creation import triu #DEFINE_ALIAS from .tensor.creation import tril #DEFINE_ALIAS from .tensor.creation import meshgrid #DEFINE_ALIAS +from .tensor.creation import empty #DEFINE_ALIAS from .tensor.linalg import matmul #DEFINE_ALIAS from .tensor.linalg import dot #DEFINE_ALIAS # from .tensor.linalg import einsum #DEFINE_ALIAS diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py index 5bc9c1444d2b34f057cd92782eb50e5fc23916eb..f7930d34f93e21bf3f832da828fb0036742b5091 100644 --- a/python/paddle/dataset/uci_housing.py +++ b/python/paddle/dataset/uci_housing.py @@ -74,7 +74,8 @@ def load_data(filename, feature_num=14, ratio=0.8): data = data.reshape(data.shape[0] // feature_num, feature_num) maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( axis=0) / data.shape[0] - feature_range(maximums[:-1], minimums[:-1]) + # if you want to print the distribution of input data, you could use function of feature_range + #feature_range(maximums[:-1], minimums[:-1]) for i in six.moves.range(feature_num - 1): data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) offset = int(data.shape[0] * ratio) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index b7357eef7ad9a3abae7f9c1c09fdc00b409061ad..27c82227316309b370aefe5e0550230c3f703c8c 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -21,6 +21,7 @@ from .parallel import get_rank from .parallel import get_world_size from paddle.fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS from paddle.fluid.dygraph.parallel import ParallelEnv #DEFINE_ALIAS +from paddle.distributed.fleet.dataset import * from . import collective from .collective import * @@ -30,11 +31,8 @@ __all__ = ["spawn"] # dygraph parallel apis __all__ += [ - "init_parallel_env", - "get_rank", - "get_world_size", - "prepare_context", - "ParallelEnv", + "init_parallel_env", "get_rank", "get_world_size", "prepare_context", + "ParallelEnv", "InMemoryDataset", "QueueDataset" ] # collective apis diff --git a/python/paddle/distributed/cloud_utils.py b/python/paddle/distributed/cloud_utils.py index 345b783d60bb79e99c98c4e9d212aa11cbe91dcc..5b7268e4b64fe34e6376819a7ac5659d1a5f5959 100644 --- a/python/paddle/distributed/cloud_utils.py +++ b/python/paddle/distributed/cloud_utils.py @@ -19,7 +19,7 @@ from paddle.distributed.utils import get_cluster, logger def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus): """ - args_node_ips, args_node_ip:string + args_node_ips:string, args_node_ip:string, args_port: int, selected_gpus:list """ #you can automatically get ip info while using paddlecloud multi nodes mode. 
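Both get_cloud_cluster rewrites in the hunks below prefer the DISTRIBUTED_TRAINER_ENDPOINTS variable (available since paddlecloud 1.8.4) and slice its flat ip:port list into one chunk per node. A sketch of that slicing, with illustrative values:

    # DISTRIBUTED_TRAINER_ENDPOINTS lists paddle_ports_num endpoints per node;
    # the new code splits it into per-node sublists.
    endpoints = "ip1:8170,ip1:8171,ip2:8170,ip2:8171".split(",")  # illustrative
    num_nodes, paddle_ports_num = 2, 2
    assert num_nodes * paddle_ports_num == len(endpoints)
    trainer_endpoints = [
        endpoints[i * paddle_ports_num:(i + 1) * paddle_ports_num]
        for i in range(num_nodes)
    ]
    # -> [['ip1:8170', 'ip1:8171'], ['ip2:8170', 'ip2:8171']]
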
node_ips = os.getenv("PADDLE_TRAINERS") @@ -31,6 +31,9 @@ def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus): node_rank = os.getenv("PADDLE_TRAINER_ID") assert node_rank is not None, "PADDLE_TRAINER_ID should not be None" + paddle_ports_num = int(os.getenv("TRAINER_PORTS_NUM")) + assert paddle_ports_num is not None, "TRAINER_PORTS_NUM should not be None" + node_ips = node_ips.split(",") num_nodes = len(node_ips) node_rank = int(node_rank) @@ -47,32 +50,47 @@ automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\ Your input cluster_node_ips: {} doesn't equals to IPs: {} from \ paddlecloud environment.".format(args_node_ips, node_ips)) - started_port = args_port - print("num_nodes:", num_nodes) - if num_nodes > 1: - try: - paddle_port = int(os.getenv("PADDLE_PORT", "")) - paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", "")) - - if paddle_port_num >= len( - selected_gpus) and paddle_port != args_port: - logger.warning("Use Cloud specified port:{}.".format( - paddle_port)) - started_port = paddle_port - - except Exception as e: - print(e) - pass - - if started_port is None: - started_port = 6170 - - logger.debug("parsed from args:node_ips:{} \ - node_ip:{} node_rank:{} started_port:{}" - .format(node_ips, node_ip, node_rank, started_port)) - - ports = [x for x in range(started_port, started_port + len(selected_gpus))] - cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus) + # DISTRIBUTED_TRAINER_ENDPOINTS: new environment since paddlecloud 1.8.4 + # e.g: DISTRIBUTED_TRAINER_ENDPOINTS="ip1:port1,ip1:port2,ip1:port3,ip1:port4,ip2:port5,ip2:port6,ip2:port7,ip2:port8" + trainer_endpoints = os.getenv("DISTRIBUTED_TRAINER_ENDPOINTS") + if trainer_endpoints is None: + started_port = args_port + if num_nodes > 1: + try: + paddle_port = int(os.getenv("PADDLE_PORT", "")) + + if paddle_ports_num >= len( + selected_gpus) and paddle_port != args_port: + logger.warning("Use Cloud specified port:{}.".format( + paddle_port)) + started_port = paddle_port + + except Exception as e: + print(e) + pass + + if started_port is None: + started_port = 6170 + ports = [ + x for x in range(started_port, started_port + len(selected_gpus)) + ] + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in ports]) + else: + trainer_endpoints_ori = trainer_endpoints.split(",") + trainer_endpoints = [] + assert num_nodes * paddle_ports_num == len(trainer_endpoints_ori) + for i in range(num_nodes): + trainer_endpoints.append(trainer_endpoints_ori[ + i * paddle_ports_num:(i + 1) * paddle_ports_num]) + + logger.debug("parsed from args: node_ips:{} \ + node_ip:{} node_rank:{} trainer_endpoints:{}" + .format(node_ips, node_ip, node_rank, trainer_endpoints)) + + cluster, pod = get_cluster(node_ips, node_ip, trainer_endpoints, + selected_gpus) return cluster, cluster.pods[node_rank] diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 5f0cf9f93d62eba9b81e8a834b52f84122f2702d..2539fa57a34b1fe6fdea6b6b847d52f765df3fa3 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -23,7 +23,6 @@ from .dataset import * __all__ = [ "DistributedStrategy", "UtilBase", - "DatasetFactory", "UserDefinedRoleMaker", "PaddleCloudRoleMaker", "Fleet", diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 
9c1793fd5b56eb728ae7d16840cf4fb09cf975c8..1b86056c00443be4170757cee3cc60bbafd0f40b 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -15,10 +15,26 @@ import paddle from paddle.distributed.fleet.proto import distributed_strategy_pb2 from paddle.fluid.framework import Variable, set_flags, core +from paddle.fluid.wrapped_decorator import wrap_decorator import google.protobuf.text_format +import google.protobuf __all__ = ["DistributedStrategy"] +non_auto_func_called = True + + +def __non_auto_func_called__(func): + def __impl__(*args, **kwargs): + global non_auto_func_called + non_auto_func_called = False + return func(*args, **kwargs) + + return __impl__ + + +is_strict_auto = wrap_decorator(__non_auto_func_called__) + def get_msg_dict(msg): res_dict = {} @@ -164,6 +180,7 @@ class DistributedStrategy(object): return execution_strategy @execution_strategy.setter + @is_strict_auto def execution_strategy(self, strategy): fields = self.strategy.execution_strategy.DESCRIPTOR.fields for f in fields: @@ -203,6 +220,7 @@ class DistributedStrategy(object): return build_strategy @build_strategy.setter + @is_strict_auto def build_strategy(self, strategy): fields = self.strategy.build_strategy.DESCRIPTOR.fields for f in fields: @@ -237,6 +255,7 @@ class DistributedStrategy(object): return self.strategy.a_sync @a_sync.setter + @is_strict_auto def a_sync(self, flag): if isinstance(flag, bool): self.strategy.a_sync = flag @@ -287,6 +306,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.a_sync_configs) @a_sync_configs.setter + @is_strict_auto def a_sync_configs(self, configs): check_configs_key(self.strategy.a_sync_configs, configs, "a_sync_configs") @@ -309,6 +329,7 @@ class DistributedStrategy(object): return self.strategy.amp @amp.setter + @is_strict_auto def amp(self, flag): if isinstance(flag, bool): self.strategy.amp = flag @@ -351,6 +372,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.amp_configs) @amp_configs.setter + @is_strict_auto def amp_configs(self, configs): check_configs_key(self.strategy.amp_configs, configs, "amp_configs") assign_configs_value(self.strategy.amp_configs, configs) @@ -388,6 +410,7 @@ class DistributedStrategy(object): return self.strategy.sync_nccl_allreduce @sync_nccl_allreduce.setter + @is_strict_auto def sync_nccl_allreduce(self, flag): if isinstance(flag, bool): self.strategy.sync_nccl_allreduce = flag @@ -411,6 +434,7 @@ class DistributedStrategy(object): return self.strategy.use_hierarchical_allreduce @use_hierarchical_allreduce.setter + @is_strict_auto def use_hierarchical_allreduce(self, flag): if isinstance(flag, bool): self.strategy.use_hierarchical_allreduce = flag @@ -435,6 +459,7 @@ class DistributedStrategy(object): return self.strategy.hierarchical_allreduce_inter_nranks @hierarchical_allreduce_inter_nranks.setter + @is_strict_auto def hierarchical_allreduce_inter_nranks(self, value): if isinstance(value, int): self.strategy.hierarchical_allreduce_inter_nranks = value @@ -461,6 +486,7 @@ class DistributedStrategy(object): return self.strategy.sync_batch_norm @sync_batch_norm.setter + @is_strict_auto def sync_batch_norm(self, flag): if isinstance(flag, bool): self.strategy.sync_batch_norm = flag @@ -483,6 +509,7 @@ class DistributedStrategy(object): return self.strategy.fuse_all_reduce_ops @fuse_all_reduce_ops.setter + @is_strict_auto def fuse_all_reduce_ops(self, flag): if isinstance(flag, bool): 
self.strategy.fuse_all_reduce_ops = flag @@ -506,6 +533,7 @@ class DistributedStrategy(object): return self.strategy.fuse_grad_size_in_MB @fuse_grad_size_in_MB.setter + @is_strict_auto def fuse_grad_size_in_MB(self, value): if isinstance(value, int): self.strategy.fuse_grad_size_in_MB = value @@ -517,6 +545,7 @@ class DistributedStrategy(object): return self.strategy.fuse_grad_size_in_TFLOPS @_fuse_grad_size_in_TFLOPS.setter + @is_strict_auto def _fuse_grad_size_in_TFLOPS(self, value): if isinstance(value, float): self.strategy.fuse_grad_size_in_TFLOPS = value @@ -543,6 +572,7 @@ class DistributedStrategy(object): return self.strategy.nccl_comm_num @nccl_comm_num.setter + @is_strict_auto def nccl_comm_num(self, value): if isinstance(value, int): self.strategy.nccl_comm_num = value @@ -550,6 +580,7 @@ class DistributedStrategy(object): print("WARNING: nccl_comm_num should have value of int type") @recompute.setter + @is_strict_auto def recompute(self, flag): if isinstance(flag, bool): self.strategy.recompute = flag @@ -574,6 +605,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.recompute_configs) @recompute_configs.setter + @is_strict_auto def recompute_configs(self, configs): check_configs_key(self.strategy.recompute_configs, configs, "checkpoint_configs") @@ -598,6 +630,7 @@ class DistributedStrategy(object): return self.strategy.pipeline @pipeline.setter + @is_strict_auto def pipeline(self, flag): if isinstance(flag, bool): self.strategy.pipeline = flag @@ -634,6 +667,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.pipeline_configs) @pipeline_configs.setter + @is_strict_auto def pipeline_configs(self, configs): check_configs_key(self.strategy.pipeline_configs, configs, "pipeline_configs") @@ -658,6 +692,7 @@ class DistributedStrategy(object): return self.strategy.localsgd @localsgd.setter + @is_strict_auto def localsgd(self, flag): if isinstance(flag, bool): self.strategy.localsgd = flag @@ -672,11 +707,7 @@ class DistributedStrategy(object): **Notes**: k_steps(int) The local steps for training before parameter synchronization. Default 1. - - If strategy.auto is set True, the local steps will be calculated automatically during training. - The algorithm is referenced in this paper: - `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD `_. - In this case, k_steps indicates the first local steps which is suggested setting to 1. + begin_step(int) The step of beginning training by localsgd. Default 1. Examples: ..
code-block:: python @@ -684,12 +715,14 @@ class DistributedStrategy(object): import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.localsgd = True - strategy.localsgd_configs = {"k_steps": 4} + strategy.localsgd_configs = {"k_steps": 4, + "begin_step": 30} """ return get_msg_dict(self.strategy.localsgd_configs) @localsgd_configs.setter + @is_strict_auto def localsgd_configs(self, configs): check_configs_key(self.strategy.localsgd_configs, configs, "localsgd_configs") @@ -714,6 +747,7 @@ class DistributedStrategy(object): return self.strategy.dgc @dgc.setter + @is_strict_auto def dgc(self, flag): if isinstance(flag, bool): self.strategy.dgc = flag @@ -749,6 +783,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.dgc_configs) @dgc_configs.setter + @is_strict_auto def dgc_configs(self, configs): check_configs_key(self.strategy.dgc_configs, configs, "dgc_configs") assign_configs_value(self.strategy.dgc_configs, configs) @@ -776,6 +811,7 @@ class DistributedStrategy(object): return self.strategy.gradient_merge @gradient_merge.setter + @is_strict_auto def gradient_merge(self, flag): if isinstance(flag, bool): self.strategy.gradient_merge = flag @@ -803,6 +839,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.gradient_merge_configs) @gradient_merge_configs.setter + @is_strict_auto def gradient_merge_configs(self, configs): check_configs_key(self.strategy.gradient_merge_configs, configs, "gradient_configs") @@ -827,6 +864,7 @@ class DistributedStrategy(object): return self.strategy.lars @lars.setter + @is_strict_auto def lars(self, flag): if isinstance(flag, bool): self.strategy.lars = flag @@ -862,6 +900,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.lars_configs) @lars_configs.setter + @is_strict_auto def lars_configs(self, configs): check_configs_key(self.strategy.lars_configs, configs, "lars_configs") assign_configs_value(self.strategy.lars_configs, configs) @@ -887,6 +926,7 @@ class DistributedStrategy(object): return self.strategy.lamb @lamb.setter + @is_strict_auto def lamb(self, flag): if isinstance(flag, bool): self.strategy.lamb = flag @@ -917,15 +957,21 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.lamb_configs) @lamb_configs.setter + @is_strict_auto def lamb_configs(self, configs): check_configs_key(self.strategy.lamb_configs, configs, "lamb_configs") assign_configs_value(self.strategy.lamb_configs, configs) @property def elastic(self): + """ + Indicates whether the current distributed training is to run on clusters with elastic resources. + Currently, this configuration is not effective. + """ return self.strategy.elastic @elastic.setter + @is_strict_auto def elastic(self, flag): if isinstance(flag, bool): self.strategy.elastic = flag @@ -934,6 +980,25 @@ @property def auto(self): + """ + Indicates whether to use the auto-parallel configuration. This is currently an experimental feature. Currently, + auto-parallelism can be used only when a user does not set any other + strategy configs except auto. For details, please refer to the following + code example. + Default Value: False + + Examples: + ..
code-block:: python + + import paddle + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.auto = True + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + """ return self.strategy.auto @auto.setter @@ -945,9 +1010,27 @@ @property def cudnn_exhaustive_search(self): + """ + Indicates whether to use the exhaustive search method to choose convolution algorithms. + Exhaustive search attempts all cuDNN algorithms to choose the fastest algorithm. + This method is time-consuming; the chosen algorithm will be cached for the given layer specifications. + Once the layer specifications (like batch size, feature map size) are changed, it will search again. + Default Value: True + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.cudnn_exhaustive_search = False + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + """ return self.strategy.cudnn_exhaustive_search @cudnn_exhaustive_search.setter + @is_strict_auto def cudnn_exhaustive_search(self, flag): if isinstance(flag, bool): self.strategy.cudnn_exhaustive_search = flag @@ -958,9 +1041,28 @@ @property def conv_workspace_size_limit(self): + """ + The workspace limit size in MB unit for choosing cuDNN convolution algorithms. + The inner function of cuDNN obtains the fastest suitable algorithm that fits within this memory limit. + Usually, a larger workspace allows faster algorithms to be chosen, + but also significantly increases memory use. Users need to trade off between memory and speed. + Default Value: 4000 + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.conv_workspace_size_limit = 1024 + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + + """ return self.strategy.conv_workspace_size_limit @conv_workspace_size_limit.setter + @is_strict_auto def conv_workspace_size_limit(self, value): if isinstance(value, int): self.strategy.conv_workspace_size_limit = value @@ -971,9 +1073,26 @@ @property def cudnn_batchnorm_spatial_persistent(self): + """ + Indicates whether to use the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode in batchnorm. + This is only useful in cudnn. + Default Value: True + + Examples: + ..
code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.cudnn_batchnorm_spatial_persistent = True + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + + """ return self.strategy.cudnn_batchnorm_spatial_persistent @cudnn_batchnorm_spatial_persistent.setter + @is_strict_auto def cudnn_batchnorm_spatial_persistent(self, flag): if isinstance(flag, bool): self.strategy.cudnn_batchnorm_spatial_persistent = flag @@ -1005,8 +1124,98 @@ class DistributedStrategy(object): if core.globals().is_public(key): core.globals()[key] = values[i] + def _is_strict_auto(self): + global non_auto_func_called + if self.strategy.auto and non_auto_func_called: + return True + return False + def __repr__(self): + spacing = 2 + max_k = 38 + max_v = 38 + + length = max_k + max_v + spacing + + h1_format = " " + "|{{:^{}s}}|\n".format(length) + h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " * + spacing, max_v) + + border = " +" + "".join(["="] * length) + "+" + line = " +" + "".join(["-"] * length) + "+" + + draws = border + "\n" + draws += h1_format.format("") + draws += h1_format.format("DistributedStrategy Overview") + draws += h1_format.format("") + fields = self.strategy.DESCRIPTOR.fields + str_res = "" + + env_draws = line + "\n" for f in fields: - print("{}: {}".format(f.name, f.default_value)) - return str(self.strategy) + if "build_strategy" in f.name or "execution_strategy" in f.name: + continue + if "_configs" in f.name: + continue + else: + if isinstance(getattr(self.strategy, f.name), bool): + if hasattr(self.strategy, f.name + "_configs"): + if getattr(self.strategy, f.name): + draws += border + "\n" + draws += h1_format.format( + "{} = True, please check {}_configs".format( + f.name, f.name)) + draws += line + "\n" + my_configs = getattr(self.strategy, + f.name + "_configs") + config_fields = my_configs.DESCRIPTOR.fields + for ff in config_fields: + if isinstance( + getattr(my_configs, ff.name), + google.protobuf.pyext._message. 
+ RepeatedScalarContainer): + values = getattr(my_configs, ff.name) + for i, v in enumerate(values): + if i == 0: + draws += h2_format.format(ff.name, + str(v)) + else: + draws += h2_format.format("", + str(v)) + else: + draws += h2_format.format( + ff.name, + str(getattr(my_configs, ff.name))) + else: + env_draws += h2_format.format( + f.name, str(getattr(self.strategy, f.name))) + else: + env_draws += h2_format.format( + f.name, str(getattr(self.strategy, f.name))) + + result_res = draws + border + "\n" + h1_format.format( + "Environment Flags, Communication Flags") + result_res += env_draws + + build_strategy_str = border + "\n" + build_strategy_str += h1_format.format("Build Strategy") + build_strategy_str += line + "\n" + + fields = self.strategy.build_strategy.DESCRIPTOR.fields + for f in fields: + build_strategy_str += h2_format.format( + f.name, str(getattr(self.strategy.build_strategy, f.name))) + build_strategy_str += border + "\n" + + execution_strategy_str = h1_format.format("Execution Strategy") + execution_strategy_str += line + "\n" + + fields = self.strategy.execution_strategy.DESCRIPTOR.fields + for f in fields: + execution_strategy_str += h2_format.format( + f.name, str(getattr(self.strategy.execution_strategy, f.name))) + execution_strategy_str += border + "\n" + + result_res += build_strategy_str + execution_strategy_str + return result_res diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 8c748060e630079af362759b1e4c1c0b09d58063..0dfcd5f3255efa945bbd4ac94b00433960eeaa22 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +import copy import warnings import paddle from paddle.fluid.framework import dygraph_only @@ -230,7 +231,7 @@ class Fleet(object): Returns: int: worker numbers - + Examples: .. code-block:: python @@ -736,7 +737,7 @@ class Fleet(object): """ Set the value of the learning rate manually in the optimizer. Only work in dygraph mode - + Args: value (float|Tensor): the value of learning rate @@ -876,7 +877,7 @@ class Fleet(object): """ Execute the optimizer once. 
Only work in dygraph mode - + Returns: None Examples: @@ -1008,6 +1009,18 @@ class Fleet(object): MetaOptimizerFactory()._get_valid_meta_optimizers( self.user_defined_optimizer) + context["user_defined_strategy"] = copy.copy(self.user_defined_strategy) + + # trigger the auto-parallel in very strict condition + # strategy = DistributedStrategy() + # strategy.auto = True + # optimizer = paddle.optimizer.SGD(learning_rate=0.1) + # optimizer = fleet.distributed_optimizer(optimizer, strategy) + if self.user_defined_strategy._is_strict_auto(): + # turn on all the strategy for each optimizer + for opt in distributed_optimizer_list: + opt._enable_strategy(self.user_defined_strategy, context) + valid_optimizer_list = [] valid_graph_optimizer_list = [] can_not_apply_optimizer_list = [] diff --git a/python/paddle/distributed/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py index 4097fc1237f8d7616101810f994c243dffb2cd67..29e10661888f8a7fd6e3c40ee356aad326c193a9 100644 --- a/python/paddle/distributed/fleet/base/strategy_compiler.py +++ b/python/paddle/distributed/fleet/base/strategy_compiler.py @@ -60,7 +60,7 @@ class StrategyCompiler(StrategyCompilerBase): def _get_valid_strategy(self, dist_strategy, can_not_apply_optimizer_list): import copy - valid_strategy = copy.copy(dist_strategy) + valid_strategy = copy.deepcopy(dist_strategy) invalid_optimizers = [] for candidate in self._meta_optimizer_candidates: is_valid = False diff --git a/python/paddle/distributed/fleet/cloud_utils.py b/python/paddle/distributed/fleet/cloud_utils.py index 49d66118d902e43f7ee0c4003c516081092b2a97..a1203bed85cadd859132ad67159b604c7b78916b 100644 --- a/python/paddle/distributed/fleet/cloud_utils.py +++ b/python/paddle/distributed/fleet/cloud_utils.py @@ -19,7 +19,7 @@ from paddle.distributed.fleet.launch_utils import get_cluster, logger def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170): """ - args_node_ips, args_node_ip:string + args_node_ips:string, selected_gpus:list, args_port: int """ #you can automatically get ip info while using paddlecloud multi nodes mode. 
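Stepping back to the fleet_base.py hunk above: the _is_strict_auto() check fires only when auto is the sole strategy field a user has touched, since every other setter is wrapped with @is_strict_auto, which clears the global non_auto_func_called flag. A hedged usage sketch of the strict-auto path, mirroring the comment block in that hunk:

    import paddle
    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.auto = True  # setting any other strategy field disables strict-auto

    optimizer = paddle.optimizer.SGD(learning_rate=0.1)
    # Under strict-auto, distributed_optimizer enables every applicable
    # meta optimizer via opt._enable_strategy(...).
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
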
node_ips = os.getenv("PADDLE_TRAINERS") @@ -31,6 +31,9 @@ def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170): node_rank = os.getenv("PADDLE_TRAINER_ID") assert node_rank is not None, "PADDLE_TRAINER_ID should not be None" + paddle_ports_num = int(os.getenv("TRAINER_PORTS_NUM")) + assert paddle_ports_num is not None, "TRAINER_PORTS_NUM should not be None" + node_ips = node_ips.split(",") num_nodes = len(node_ips) node_rank = int(node_rank) @@ -42,32 +45,47 @@ automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\ Your input cluster_node_ips: {} doesn't equals to IPs: {} from \ paddlecloud environment.".format(args_node_ips, node_ips)) - started_port = args_port - print("num_nodes:", num_nodes) - if num_nodes > 1: - try: - paddle_port = int(os.getenv("PADDLE_PORT", "")) - paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", "")) - - if paddle_port_num >= len( - selected_gpus) and paddle_port != args_port: - logger.warning("Use Cloud specified port:{}.".format( - paddle_port)) - started_port = paddle_port - - except Exception as e: - print(e) - pass - - if started_port is None: - started_port = 6170 - - logger.debug("parsed from args:node_ips:{} \ - node_ip:{} node_rank:{} started_port:{}" - .format(node_ips, node_ip, node_rank, started_port)) - - ports = [x for x in range(started_port, started_port + len(selected_gpus))] - cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus) + # DISTRIBUTED_TRAINER_ENDPOINTS: new environment since paddlecloud 1.8.4 + # e.g: DISTRIBUTED_TRAINER_ENDPOINTS="ip1:port1,ip1:port2,ip1:port3,ip1:port4,ip2:port5,ip2:port6,ip2:port7,ip2:port8" + trainer_endpoints = os.getenv("DISTRIBUTED_TRAINER_ENDPOINTS") + if trainer_endpoints is None: + started_port = args_port + if num_nodes > 1: + try: + paddle_port = int(os.getenv("PADDLE_PORT", "")) + + if paddle_ports_num >= len( + selected_gpus) and paddle_port != args_port: + logger.warning("Use Cloud specified port:{}.".format( + paddle_port)) + started_port = paddle_port + + except Exception as e: + print(e) + pass + + if started_port is None: + started_port = 6170 + ports = [ + x for x in range(started_port, started_port + len(selected_gpus)) + ] + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in ports]) + else: + trainer_endpoints_ori = trainer_endpoints.split(",") + trainer_endpoints = [] + assert num_nodes * paddle_ports_num == len(trainer_endpoints_ori) + for i in range(num_nodes): + trainer_endpoints.append(trainer_endpoints_ori[ + i * paddle_ports_num:(i + 1) * paddle_ports_num]) + + logger.debug("parsed from args: node_ips:{} \ + node_ip:{} node_rank:{} trainer_endpoints:{}" + .format(node_ips, node_ip, node_rank, trainer_endpoints)) + + cluster, pod = get_cluster(node_ips, node_ip, trainer_endpoints, + selected_gpus) return cluster, cluster.pods[node_rank] @@ -75,7 +93,8 @@ def use_paddlecloud(): node_ips = os.getenv("PADDLE_TRAINERS") node_ip = os.getenv("POD_IP") node_rank = os.getenv("PADDLE_TRAINER_ID") - if node_ips is None or node_ip is None or node_rank is None: + paddle_ports_num = os.getenv("TRAINER_PORTS_NUM") + if node_ips is None or node_ip is None or node_rank is None or paddle_ports_num is None: return False else: return True diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index f6504cacd9680806a13b4bb815247124b7e6a23c..5bd971181ed34e53ec90a31eb7371071372d443a 100644 --- a/python/paddle/distributed/fleet/dataset/dataset.py 
+++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -14,54 +14,11 @@ """This is definition of dataset class, which is high performance IO.""" import paddle -import paddle.fluid as fluid from paddle.fluid.proto import data_feed_pb2 from google.protobuf import text_format import paddle.fluid.core as core -class DatasetFactory(object): - """ - DatasetFactory is a factory which create dataset by its name, - you can create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset", - the default is "QueueDataset". - - Example: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - - """ - - def __init__(self): - """ Init. """ - pass - - def create_dataset(self, datafeed_class="QueueDataset"): - """ - Create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset", - the default is "QueueDataset". - - Args: - datafeed_class(str): datafeed class name, QueueDataset or InMemoryDataset. - Default is QueueDataset. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - - """ - try: - dataset = globals()[datafeed_class]() - return dataset - except: - raise ValueError("datafeed class %s does not exist" % - datafeed_class) - - class DatasetBase(object): """ Base dataset class. """ @@ -75,96 +32,67 @@ class DatasetBase(object): self.thread_num = 1 self.filelist = [] - def set_pipe_command(self, pipe_command): + def init(self, + batch_size=1, + thread_num=1, + use_var=[], + pipe_command="cat", + input_type=0, + fs_name="", + fs_ugi="", + download_cmd="cat"): """ - Set pipe command of current dataset - A pipe command is a UNIX pipeline command that can be used only - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_pipe_command("python my_script.py") + should be called only once in user's python scripts to initialize setings of dataset instance. + Normally, it is called by InMemoryDataset or QueueDataset. Args: - pipe_command(str): pipe command + batch_size(int): batch size. It will be effective during training. default is 1. + thread_num(int): thread num, it is the num of readers. default is 1. + use_var(list): list of variables. Variables which you will use. default is []. + pipe_command(str): pipe command of current dataset. A pipe command is a UNIX pipeline command that can be used only. default is "cat" + input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. defalut is 0. + fs_name(str): fs name. default is "". + fs_ugi(str): fs ugi. default is "". + download_cmd(str): customized download command. default is "cat" - """ - self.proto_desc.pipe_command = pipe_command - def set_rank_offset(self, rank_offset): """ - Set rank_offset for merge_pv. It set the message of Pv. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_rank_offset("rank_offset") - - Args: - rank_offset(str): rank_offset's name + self._set_batch_size(batch_size) + self._set_thread(thread_num) + self._set_use_var(use_var) + self._set_pipe_command(pipe_command) + self._set_input_type(input_type) + self._set_hdfs_config(fs_name, fs_ugi) + self._set_download_cmd(download_cmd) + def _set_pipe_command(self, pipe_command): """ - self.proto_desc.rank_offset = rank_offset + Set pipe command of current dataset + A pipe command is a UNIX pipeline command that can be used only - def set_fea_eval(self, record_candidate_size, fea_eval=True): - """ - set fea eval mode for slots shuffle to debug the importance level of - slots(features), fea_eval need to be set True for slots shuffle. - - Args: - record_candidate_size(int): size of instances candidate to shuffle - one slot - fea_eval(bool): whether enable fea eval mode to enable slots shuffle. - default is True. - Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_fea_eval(1000000, True) + import paddle + dataset = paddle.distributed.fleet.dataset.DatasetBase() + dataset._set_pipe_command("python my_script.py") - """ - if fea_eval: - self.dataset.set_fea_eval(fea_eval, record_candidate_size) - self.fea_eval = fea_eval - - def slots_shuffle(self, slots): - """ - Slots Shuffle - Slots Shuffle is a shuffle method in slots level, which is usually used - in sparse feature with large scale of instances. To compare the metric, i.e. - auc while doing slots shuffle on one or several slots with baseline to - evaluate the importance level of slots(features). - Args: - slots(list[string]): the set of slots(string) to do slots shuffle. + pipe_command(str): pipe command - Examples: - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_merge_by_lineid() - #suppose there is a slot 0 - dataset.slots_shuffle(['0']) """ - if self.fea_eval: - slots_set = set(slots) - self.dataset.slots_shuffle(slots_set) + self.proto_desc.pipe_command = pipe_command - def set_batch_size(self, batch_size): + def _set_batch_size(self, batch_size): """ Set batch size. Will be effective during training Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_batch_size(128) + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + dataset._set_batch_size(128) Args: batch_size(int): batch size @@ -172,32 +100,16 @@ class DatasetBase(object): """ self.proto_desc.batch_size = batch_size - def set_pv_batch_size(self, pv_batch_size): - """ - Set pv batch size. It will be effective during enable_pv_merge - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_pv_batch(128) - Args: - pv_batch_size(int): pv batch size - - """ - self.proto_desc.pv_batch_size = pv_batch_size - - def set_thread(self, thread_num): + def _set_thread(self, thread_num): """ Set thread num, it is the num of readers. Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_thread(12) + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + dataset._set_thread(12) Args: thread_num(int): thread num @@ -212,8 +124,8 @@ class DatasetBase(object): Examples: .. 
code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() + import paddle + dataset = paddle.distributed.fleet.DatasetBase() dataset.set_filelist(['a.txt', 'b.txt']) Args: @@ -222,19 +134,19 @@ class DatasetBase(object): self.dataset.set_filelist(filelist) self.filelist = filelist - def set_input_type(self, input_type): + def _set_input_type(self, input_type): self.proto_desc.input_type = input_type - def set_use_var(self, var_list): + def _set_use_var(self, var_list): """ Set Variables which you will use. Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_use_var([data, label]) + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + dataset._set_use_var([data, label]) Args: var_list(list): variable list @@ -253,19 +165,19 @@ class DatasetBase(object): slot_var.type = "uint64" else: raise ValueError( - "Currently, fluid.dataset only supports dtype=float32 and dtype=int64" + "Currently, paddle.distributed.fleet.dataset only supports dtype=float32 and dtype=int64" ) - def set_hdfs_config(self, fs_name, fs_ugi): + def _set_hdfs_config(self, fs_name, fs_ugi): """ Set hdfs config: fs name ad ugi Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_hdfs_config("my_fs_name", "my_fs_ugi") + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + dataset._set_hdfs_config("my_fs_name", "my_fs_ugi") Args: fs_name(str): fs name @@ -273,16 +185,16 @@ class DatasetBase(object): """ self.dataset.set_hdfs_config(fs_name, fs_ugi) - def set_download_cmd(self, download_cmd): + def _set_download_cmd(self, download_cmd): """ Set customized download cmd: download_cmd Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_download_cmd("./read_from_afs") + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + dataset._set_download_cmd("./read_from_afs") Args: download_cmd(str): customized download command @@ -297,22 +209,22 @@ class DatasetBase(object): if self.thread_num > len(self.filelist): self.thread_num = len(self.filelist) self.dataset.set_thread_num(self.thread_num) - self.dataset.set_data_feed_desc(self.desc()) + self.dataset.set_data_feed_desc(self._desc()) self.dataset.create_readers() def _finish_to_run(self): self.dataset.destroy_readers() - def desc(self): + def _desc(self): """ Returns a protobuf message for this DataFeedDesc Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - print(dataset.desc()) + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + print(dataset._desc()) Returns: A string message @@ -330,10 +242,10 @@ class InMemoryDataset(DatasetBase): """ InMemoryDataset, it will load data into memory and shuffle data before training. 
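The dataset hunks that follow fold the removed set_* helpers into a single init() call, with private _set_* methods behind it. Assuming a build with this patch, the new-style setup looks roughly like this; the file names are illustrative:

    import paddle

    # One init() call replaces the old chain of set_batch_size()/set_thread()/...
    dataset = paddle.distributed.InMemoryDataset()
    dataset.init(
        batch_size=32,
        thread_num=2,
        pipe_command="cat",
        use_var=[])  # pass the real feed variables here
    dataset.set_filelist(["a.txt", "b.txt"])  # illustrative file list
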
- This class should be created by DatasetFactory Example: - dataset = paddle.fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.InMemoryDataset() """ def __init__(self): @@ -351,7 +263,229 @@ class InMemoryDataset(DatasetBase): self.merge_by_lineid = False self.fleet_send_sleep_seconds = None - def set_feed_type(self, data_feed_type): + def _init_distributed_settings(self, **kwargs): + """ + should be called only once in user's python scripts to initialize distributed-related settings of dataset instance + Args: + kwargs: Keyword arguments. Currently, we support following keys in **kwargs: + + merge_size(int): ins size to merge, if merge_size > 0, set merge by line id, + instances of same line id will be merged after shuffle, + you should parse line id in data generator. default is -1. + parse_ins_id(bool): Set if Dataset needs to parse ins_id. default is False. + parse_content(bool): Set if Dataset needs to parse content. default is False. + fleet_send_batch_size(int): Set fleet send batch size in one rpc, default is 1024 + fleet_send_sleep_seconds(int): Set fleet send sleep time, default is 0 + fea_eval(bool): Set if Dataset needs to do feature importance evaluation using slots shuffle. + default is False. + candidate_size(int): if fea_eval is set True, set the candidate size used in slots shuffle. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=[]) + dataset._init_distributed_settings( + parse_ins_id=True, + parse_content=True, + fea_eval=True, + candidate_size=10000) + + """ + merge_size = kwargs.get("merge_size", -1) + if merge_size > 0: + self._set_merge_by_lineid(merge_size) + + parse_ins_id = kwargs.get("parse_ins_id", False) + self._set_parse_ins_id(parse_ins_id) + + parse_content = kwargs.get("parse_content", False) + self._set_parse_content(parse_content) + + fleet_send_batch_size = kwargs.get("fleet_send_batch_size", None) + if fleet_send_batch_size: + self._set_fleet_send_batch_size(fleet_send_batch_size) + + fleet_send_sleep_seconds = kwargs.get("fleet_send_sleep_seconds", None) + if fleet_send_sleep_seconds: + self._set_fleet_send_sleep_seconds(fleet_send_sleep_seconds) + + fea_eval = kwargs.get("fea_eval", False) + if fea_eval: + candidate_size = kwargs.get("candidate_size", 10000) + self._set_fea_eval(candidate_size, True) + + def update_settings(self, **kwargs): + """ + should be called in user's python scripts to update settings of dataset instance + Args: + kwargs: Keyword arguments. Currently, we support following keys in **kwargs, + including single node settings and advanced distributed related settings: + + batch_size(int): batch size. It will be effective during training. default is 1. + thread_num(int): thread num, it is the num of readers. default is 1. + use_var(list): list of variables. Variables which you will use. default is []. + input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. default is 0. + fs_name(str): fs name. default is "". + fs_ugi(str): fs ugi. default is "". + pipe_command(str): pipe command of current dataset. A pipe command is a UNIX pipeline command. default is "cat" + download_cmd(str): customized download command. default is "cat" + data_feed_type(str): data feed type used in c++ code. default is "MultiSlotInMemoryDataFeed".
+ queue_num(int): Dataset output queue num, training threads get data from queues. default is -1, which means it is set to the same value as the thread number in c++. + + merge_size(int): ins size to merge, if merge_size > 0, set merge by line id, + instances of same line id will be merged after shuffle, + you should parse line id in data generator. default is -1. + parse_ins_id(bool): Set if Dataset needs to parse ins_id. default is False. + parse_content(bool): Set if Dataset needs to parse content. default is False. + fleet_send_batch_size(int): Set fleet send batch size in one rpc, default is 1024 + fleet_send_sleep_seconds(int): Set fleet send sleep time, default is 0 + fea_eval(bool): Set if Dataset needs to do feature importance evaluation using slots shuffle. + default is False. + candidate_size(int): if fea_eval is set True, set the candidate size used in slots shuffle. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=[]) + dataset._init_distributed_settings( + parse_ins_id=True, + parse_content=True, + fea_eval=True, + candidate_size=10000) + dataset.update_settings(batch_size=2) + + """ + for key in kwargs: + if key == "pipe_command": + self._set_pipe_command(kwargs[key]) + elif key == "batch_size": + self._set_batch_size(kwargs[key]) + elif key == "thread_num": + self._set_thread(kwargs[key]) + elif key == "use_var": + self._set_use_var(kwargs[key]) + elif key == "input_type": + self._set_input_type(kwargs[key]) + elif key == "fs_name" and "fs_ugi" in kwargs: + self._set_hdfs_config(kwargs[key], kwargs["fs_ugi"]) + elif key == "download_cmd": + self._set_download_cmd(kwargs[key]) + elif key == "merge_size" and kwargs.get("merge_size", -1) > 0: + self._set_merge_by_lineid(kwargs[key]) + elif key == "parse_ins_id": + self._set_parse_ins_id(kwargs[key]) + elif key == "parse_content": + self._set_parse_content(kwargs[key]) + elif key == "fleet_send_batch_size": + self._set_fleet_send_batch_size(kwargs[key]) + elif key == "fleet_send_sleep_seconds": + self._set_fleet_send_sleep_seconds(kwargs[key]) + elif key == "fea_eval" and kwargs[key] == True: + candidate_size = kwargs.get("candidate_size", 10000) + self._set_fea_eval(candidate_size, True) +
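A quick usage note on the dispatch loop above, written as a minimal sketch rather than as part of the patch: each recognized key is routed to the matching private setter, unrecognized keys fall through silently, and fs_name only takes effect when fs_ugi is passed in the same call. All values below are hypothetical.

import paddle

dataset = paddle.distributed.InMemoryDataset()
dataset.init(batch_size=1, thread_num=2, pipe_command="cat", use_var=[])

# routed to _set_batch_size and _set_thread
dataset.update_settings(batch_size=16, thread_num=4)

# fs_name is applied only because fs_ugi is present in the same call
dataset.update_settings(fs_name="my_fs", fs_ugi="user,passwd")

# a misspelled key matches no branch and is dropped without an error
dataset.update_settings(batchsize=32)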
+ def init(self, **kwargs): + """ + should be called only once in user's python scripts to initialize settings of dataset instance + Args: + kwargs: Keyword arguments. Currently, we support the following keys in **kwargs: + + batch_size(int): batch size. It will be effective during training. default is 1. + thread_num(int): thread num, it is the num of readers. default is 1. + use_var(list): list of variables. Variables which you will use. default is []. + input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. default is 0. + fs_name(str): fs name. default is "". + fs_ugi(str): fs ugi. default is "". + pipe_command(str): pipe command of current dataset. A pipe command is a UNIX pipeline command. default is "cat" + download_cmd(str): customized download command. default is "cat" + data_feed_type(str): data feed type used in c++ code. default is "MultiSlotInMemoryDataFeed". + queue_num(int): Dataset output queue num, training threads get data from queues. default is -1, which means it is set to the same value as the thread number in c++. + + Examples: + .. code-block:: python + + import os + import paddle + import paddle.fluid as fluid + from paddle.fluid import core + with open("test_queue_dataset_run_a.txt", "w") as f: + data = "2 1 2 2 5 4 2 2 7 2 1 3\n" + data += "2 6 2 2 1 4 2 2 4 2 2 3\n" + data += "2 5 2 2 9 9 2 2 7 2 1 3\n" + data += "2 7 2 2 1 9 2 3 7 2 5 3\n" + f.write(data) + with open("test_queue_dataset_run_b.txt", "w") as f: + data = "2 1 2 2 5 4 2 2 7 2 1 3\n" + data += "2 6 2 2 1 4 2 2 4 2 2 3\n" + data += "2 5 2 2 9 9 2 2 7 2 1 3\n" + data += "2 7 2 2 1 9 2 3 7 2 5 3\n" + f.write(data) + + slots = ["slot1", "slot2", "slot3", "slot4"] + slots_vars = [] + for slot in slots: + var = fluid.data( + name=slot, shape=[None, 1], dtype="int64", lod_level=1) + slots_vars.append(var) + + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=slots_vars) + dataset.set_filelist( + ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) + dataset.load_into_memory() + + exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0)) + exe.run(fluid.default_startup_program()) + exe.train_from_dataset(fluid.default_main_program(), + dataset) + os.remove("./test_queue_dataset_run_a.txt") + os.remove("./test_queue_dataset_run_b.txt") + """ + batch_size = kwargs.get("batch_size", 1) + thread_num = kwargs.get("thread_num", 1) + use_var = kwargs.get("use_var", []) + input_type = kwargs.get("input_type", 0) + fs_name = kwargs.get("fs_name", "") + fs_ugi = kwargs.get("fs_ugi", "") + pipe_command = kwargs.get("pipe_command", "cat") + download_cmd = kwargs.get("download_cmd", "cat") + + super(InMemoryDataset, self).init( + batch_size=batch_size, + thread_num=thread_num, + use_var=use_var, + pipe_command=pipe_command, + input_type=input_type, + fs_name=fs_name, + fs_ugi=fs_ugi, + download_cmd=download_cmd) + + data_feed_type = kwargs.get("data_feed_type", + "MultiSlotInMemoryDataFeed") + self._set_feed_type(data_feed_type) + + if kwargs.get("queue_num", -1) > 0: + queue_num = kwargs.get("queue_num", -1) + self._set_queue_num(queue_num) + + def _set_feed_type(self, data_feed_type): """ Set data_feed_desc """ @@ -373,7 +507,7 @@ class InMemoryDataset(DatasetBase): self.dataset.set_parse_logkey(self.parse_logkey) self.dataset.set_merge_by_sid(self.merge_by_sid) self.dataset.set_enable_pv_merge(self.enable_pv_merge) - self.dataset.set_data_feed_desc(self.desc()) + self.dataset.set_data_feed_desc(self._desc()) self.dataset.create_channel() self.dataset.create_readers() @@ -387,7 +521,7 @@ class InMemoryDataset(DatasetBase): self.dataset.dynamic_adjust_channel_num(self.thread_num, False) self.dataset.dynamic_adjust_readers_num(self.thread_num) - def set_queue_num(self, queue_num): + def _set_queue_num(self, queue_num): """ Set Dataset output queue num, training threads get data from queues @@ -397,17 +531,17 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_queue_num(12) + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_queue_num(12) """ self.is_user_set_queue_num = True self.queue_num = queue_num - def set_parse_ins_id(self, parse_ins_id): + def _set_parse_ins_id(self, parse_ins_id): """ - Set id Dataset need to parse insid + Set if Dataset needs to parse ins_id Args: parse_ins_id(bool): if parse ins_id or not @@ -415,14 +549,14 @@ Examples: .. 
code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_parse_ins_id(True) + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_parse_ins_id(True) """ self.parse_ins_id = parse_ins_id - def set_parse_content(self, parse_content): + def _set_parse_content(self, parse_content): """ Set if Dataset need to parse content @@ -432,120 +566,14 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_parse_content(True) + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_parse_content(True) """ self.parse_content = parse_content - def set_parse_logkey(self, parse_logkey): - """ - Set if Dataset need to parse logkey - - Args: - parse_content(bool): if parse logkey or not - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_parse_logkey(True) - - """ - self.parse_logkey = parse_logkey - - def set_merge_by_sid(self, merge_by_sid): - """ - Set if Dataset need to merge sid. If not, one ins means one Pv. - - Args: - merge_by_sid(bool): if merge sid or not - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_merge_by_sid(True) - - """ - self.merge_by_sid = merge_by_sid - - def set_enable_pv_merge(self, enable_pv_merge): - """ - Set if Dataset need to merge pv. - - Args: - enable_pv_merge(bool): if enable_pv_merge or not - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_enable_pv_merge(True) - - """ - self.enable_pv_merge = enable_pv_merge - - def preprocess_instance(self): - """ - Merge pv instance and convey it from input_channel to input_pv_channel. - It will be effective when enable_pv_merge_ is True. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - dataset.preprocess_instance() - - """ - self.dataset.preprocess_instance() - - def set_current_phase(self, current_phase): - """ - Set current phase in train. It is useful for untest. - current_phase : 1 for join, 0 for update. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - dataset.set_current_phase(1) - - """ - self.dataset.set_current_phase(current_phase) - - def postprocess_instance(self): - """ - Divide pv instance and convey it to input_channel. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - dataset.preprocess_instance() - exe.train_from_dataset(dataset) - dataset.postprocess_instance() - - """ - self.dataset.postprocess_instance() - - def set_fleet_send_batch_size(self, fleet_send_batch_size=1024): + def _set_fleet_send_batch_size(self, fleet_send_batch_size=1024): """ Set fleet send batch size, default is 1024 @@ -555,14 +583,14 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_fleet_send_batch_size(800) + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_fleet_send_batch_size(800) """ self.fleet_send_batch_size = fleet_send_batch_size - def set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0): + def _set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0): """ Set fleet send sleep time, default is 0 @@ -572,14 +600,14 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_fleet_send_sleep_seconds(2) + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_fleet_send_sleep_seconds(2) """ self.fleet_send_sleep_seconds = fleet_send_sleep_seconds - def set_merge_by_lineid(self, merge_size=2): + def _set_merge_by_lineid(self, merge_size=2): """ Set merge by line id, instances of same line id will be merged after shuffle, you should parse line id in data generator. @@ -590,22 +618,22 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_merge_by_lineid() + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_merge_by_lineid() """ self.dataset.set_merge_by_lineid(merge_size) self.merge_by_lineid = True self.parse_ins_id = True - def set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num): + def _set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num): self.dataset.set_generate_unique_feasigns(generate_uni_feasigns) self.gen_uni_feasigns = generate_uni_feasigns self.local_shard_num = shard_num - def generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num, - consume_thread_num, shard_num): + def _generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num, + consume_thread_num, shard_num): self.dataset.generate_local_tables_unlock( table_id, fea_dim, read_thread_num, consume_thread_num, shard_num) @@ -616,8 +644,8 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -635,8 +663,8 @@ class InMemoryDataset(DatasetBase): Examples: .. 
code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.preload_into_memory() @@ -656,8 +684,8 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.preload_into_memory() @@ -673,8 +701,8 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -692,9 +720,9 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -736,9 +764,9 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -751,30 +779,6 @@ class InMemoryDataset(DatasetBase): """ self.dataset.release_memory() - def get_pv_data_size(self): - """ - Get memory data size of Pv, user can call this function to know the pv num - of ins in all workers after load into memory. - - Note: - This function may cause bad performance, because it has barrier - - Returns: - The size of memory pv data. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - print dataset.get_pv_data_size() - - """ - return self.dataset.get_pv_data_size() - def get_memory_data_size(self, fleet=None): """ Get memory data size, user can call this function to know the num @@ -792,9 +796,9 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -829,9 +833,9 @@ class InMemoryDataset(DatasetBase): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -849,6 +853,51 @@ return global_data_size[0] return local_data_size[0] + def _set_fea_eval(self, record_candidate_size, fea_eval=True): + """ + set fea eval mode for slots shuffle to debug the importance level of + slots (features); fea_eval needs to be set to True for slots shuffle. + + Args: + record_candidate_size(int): size of the candidate instances to shuffle + one slot + fea_eval(bool): whether to enable fea eval mode for slots shuffle. + default is True. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_fea_eval(1000000, True) + + """ + if fea_eval: + self.dataset.set_fea_eval(fea_eval, record_candidate_size) + self.fea_eval = fea_eval + + def slots_shuffle(self, slots): + """ + Slots Shuffle + Slots Shuffle is a shuffle method at the slot level, which is usually used + in sparse features with a large scale of instances. Compare the metric, e.g. + auc, while doing slots shuffle on one or several slots with a baseline to + evaluate the importance level of slots (features). + + Args: + slots(list[string]): the set of slots(string) to do slots shuffle. + + Examples: + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_merge_by_lineid() + #suppose there is a slot 0 + dataset.slots_shuffle(['0']) + """ + if self.fea_eval: + slots_set = set(slots) + self.dataset.slots_shuffle(slots_set) +
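Taken together, _set_fea_eval and slots_shuffle form the feature-importance workflow; a minimal end-to-end sketch (file names hypothetical, not part of the patch). Note that slots_shuffle is a no-op unless fea_eval was switched on first through _init_distributed_settings.

import paddle

dataset = paddle.distributed.InMemoryDataset()
dataset.init(batch_size=1, thread_num=2, pipe_command="cat", use_var=[])
# enable slots shuffle; candidate_size bounds the shuffled instance pool
dataset._init_distributed_settings(fea_eval=True, candidate_size=10000)
dataset.set_filelist(["a.txt", "b.txt"])
dataset.load_into_memory()
# shuffle only slot "0", then compare the metric against a baseline run
dataset.slots_shuffle(["0"])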
class QueueDataset(DatasetBase): """ @@ -857,19 +906,24 @@ class QueueDataset(DatasetBase): """ Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") + import paddle + dataset = paddle.distributed.QueueDataset() """ def __init__(self): """ Initialize QueueDataset - This class should be created by DatasetFactory """ super(QueueDataset, self).__init__() self.proto_desc.name = "MultiSlotDataFeed" + def init(self, **kwargs): + """ + should be called only once in user's python scripts to initialize settings of dataset instance + """ + super(QueueDataset, self).init(**kwargs) + def _prepare_to_run(self): """ Set data_feed_desc/thread num/filelist before run, @@ -881,57 +935,9 @@ self.thread_num = 1 self.dataset.set_thread_num(self.thread_num) self.dataset.set_filelist(self.filelist) - self.dataset.set_data_feed_desc(self.desc()) + self.dataset.set_data_feed_desc(self._desc()) self.dataset.create_readers() - def local_shuffle(self): - """ - Local shuffle data. - - Local shuffle is not supported in QueueDataset - NotImplementedError will be raised - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") - dataset.local_shuffle() - - Raises: - NotImplementedError: QueueDataset does not support local shuffle - - """ - raise NotImplementedError( - "QueueDataset does not support local shuffle, " - "please use InMemoryDataset for local_shuffle") - - def global_shuffle(self, fleet=None): - """ - Global shuffle data. - - Global shuffle is not supported in QueueDataset - NotImplementedError will be raised - - Args: - fleet(Fleet): fleet singleton. Default None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") - dataset.global_shuffle(fleet) - - Raises: - NotImplementedError: QueueDataset does not support global shuffle - - """ - raise NotImplementedError( - "QueueDataset does not support global shuffle, " - "please use InMemoryDataset for global_shuffle") - class FileInstantDataset(DatasetBase): """ @@ -940,35 +946,22 @@ Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory.create_dataset("FileInstantDataset") + import paddle + dataset = paddle.distributed.fleet.FileInstantDataset() """ def __init__(self): """ Initialize FileInstantDataset - This class should be created by DatasetFactory """ super(FileInstantDataset, self).__init__() self.proto_desc.name = "MultiSlotFileInstantDataFeed" - def local_shuffle(self): + def init(self, **kwargs): """ - Local shuffle - FileInstantDataset does not support local shuffle + should be called only once in user's python scripts to initialize settings of dataset instance """ - raise NotImplementedError( - "FileInstantDataset does not support local shuffle, " - "please use InMemoryDataset for local_shuffle") - - def global_shuffle(self, fleet=None): - """ - Global shuffle - FileInstantDataset does not support global shuffle - """ - raise NotImplementedError( - "FileInstantDataset does not support global shuffle, " - "please use InMemoryDataset for global_shuffle") + super(FileInstantDataset, self).init(**kwargs) class BoxPSDataset(InMemoryDataset): """ @@ -978,19 +971,119 @@ Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() """ def __init__(self): """ Initialize BoxPSDataset - This class should be created by DatasetFactory """ super(BoxPSDataset, self).__init__() self.boxps = core.BoxPS(self.dataset) self.proto_desc.name = "PaddleBoxDataFeed" + def init(self, **kwargs): + """ + should be called only once in user's python scripts to initialize settings of dataset instance + """ + super(BoxPSDataset, self).init(**kwargs) + + rank_offset = kwargs.get("rank_offset", "") + self._set_rank_offset(rank_offset) + pv_batch_size = kwargs.get("pv_batch_size", 1) + self._set_pv_batch_size(pv_batch_size) + parse_logkey = kwargs.get("parse_logkey", False) + self._set_parse_logkey(parse_logkey) + merge_by_sid = kwargs.get("merge_by_sid", False) + self._set_merge_by_sid(merge_by_sid) + enable_pv_merge = kwargs.get("enable_pv_merge", False) + self._set_enable_pv_merge(enable_pv_merge) +
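Since BoxPSDataset.init carries no example of its own, here is a minimal sketch of the extra keyword arguments it consumes on top of InMemoryDataset.init; every value below is hypothetical and only illustrates where each kwarg is forwarded.

import paddle

dataset = paddle.distributed.fleet.BoxPSDataset()
dataset.init(
    batch_size=128,
    thread_num=2,
    pipe_command="cat",
    use_var=[],
    rank_offset="rank_offset",  # forwarded to _set_rank_offset
    pv_batch_size=128,          # forwarded to _set_pv_batch_size
    parse_logkey=True,          # forwarded to _set_parse_logkey
    merge_by_sid=True,          # forwarded to _set_merge_by_sid
    enable_pv_merge=True)       # forwarded to _set_enable_pv_merge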
+ def _set_rank_offset(self, rank_offset): + """ + Set rank_offset for merge_pv. It sets the message of Pv. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset._set_rank_offset("rank_offset") + + Args: + rank_offset(str): rank_offset's name + + """ + self.proto_desc.rank_offset = rank_offset + + def _set_pv_batch_size(self, pv_batch_size): + """ + Set pv batch size. It will be effective during enable_pv_merge + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset._set_pv_batch_size(128) + Args: + pv_batch_size(int): pv batch size + + """ + self.proto_desc.pv_batch_size = pv_batch_size + + def _set_parse_logkey(self, parse_logkey): + """ + Set if Dataset needs to parse logkey + + Args: + parse_logkey(bool): if parse logkey or not + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset._set_parse_logkey(True) + + """ + self.parse_logkey = parse_logkey + + def _set_merge_by_sid(self, merge_by_sid): + """ + Set if Dataset needs to merge sid. If not, one ins means one Pv. + + Args: + merge_by_sid(bool): if merge sid or not + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset._set_merge_by_sid(True) + + """ + self.merge_by_sid = merge_by_sid + + def _set_enable_pv_merge(self, enable_pv_merge): + """ + Set if Dataset needs to merge pv. + + Args: + enable_pv_merge(bool): if enable_pv_merge or not + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset._set_enable_pv_merge(True) + + """ + self.enable_pv_merge = enable_pv_merge + def set_date(self, date): """ Workaround for date @@ -1008,8 +1101,8 @@ Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() dataset.begin_pass() """ self.boxps.begin_pass() @@ -1021,8 +1114,8 @@ Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() dataset.end_pass(True) """ self.boxps.end_pass(need_save_delta) @@ -1034,8 +1127,8 @@ Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.preload_into_memory() @@ -1049,8 +1142,8 @@ Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -1064,8 +1157,8 @@ Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.preload_into_memory() @@ -1093,11 +1186,90 @@ slots(list[string]): the set of slots(string) to do slots shuffle. Examples: - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() dataset.set_merge_by_lineid() #suppose there is a slot 0 dataset.slots_shuffle(['0']) """ slots_set = set(slots) self.boxps.slots_shuffle(slots_set) + + def set_current_phase(self, current_phase): + """ + Set current phase in train. It is useful for unittest. 
+ current_phase : 1 for join, 0 for update. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.set_current_phase(1) + + """ + self.dataset.set_current_phase(current_phase) + + def get_pv_data_size(self): + """ + Get memory data size of Pv, user can call this function to know the pv num + of ins in all workers after load into memory. + + Note: + This function may cause bad performance, because it has barrier + + Returns: + The size of memory pv data. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + print(dataset.get_pv_data_size()) + + """ + return self.dataset.get_pv_data_size() + + def preprocess_instance(self): + """ + Merge pv instance and convey it from input_channel to input_pv_channel. + It will be effective when enable_pv_merge_ is True. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.preprocess_instance() + + """ + self.dataset.preprocess_instance() + + def postprocess_instance(self): + """ + Divide pv instance and convey it to input_channel. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.preprocess_instance() + exe.train_from_dataset(dataset) + dataset.postprocess_instance() + + """ + self.dataset.postprocess_instance() diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 7778acaf83b310cfa9a04059ce6d3be2d5326089..6dba385c569be75b5b83e0a63e560ffa8ab73696 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -157,17 +157,20 @@ def get_cluster_from_args(args, gpus): free_ports = [x for x in range(start_port, start_port + len(gpus))] - return get_cluster(node_ips, node_ip, free_ports, gpus) + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) + return get_cluster(node_ips, node_ip, trainer_endpoints, gpus) def get_gpus(gpus): if gpus is None: gpus_num = fluid.core.get_cuda_device_count() - gpus = [str(x) for x in range(0, gpus_num)] + res_gpus = [str(x) for x in range(0, gpus_num)] else: cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") if cuda_visible_devices is None or cuda_visible_devices == "": - gpus = [x.strip() for x in gpus.split(',')] + res_gpus = [x.strip() for x in gpus.split(',')] else: # change gpus into relative values # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.gpus=4,5,6,7; @@ -177,12 +180,16 @@ def get_gpus(gpus): assert x in cuda_visible_devices_list, "Can't find "\ "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\ % (x, cuda_visible_devices) - gpus = [ + res_gpus = [ cuda_visible_devices_list.index(x.strip()) for x in gpus.split(',') ] + logger.info("Change selected_gpus into relative values. --ips:{} " + "will change into relative_ips:{} according to your " + "CUDA_VISIBLE_DEVICES:{}".format( + gpus, res_gpus, cuda_visible_devices_list)) - return gpus + return res_gpus def launch_collective(args):
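To make the get_gpus change above concrete, a small worked sketch (assuming the module can be imported directly; device ids are hypothetical). Note the pre-existing quirk that the passthrough branch returns strings while the remapping branch returns ints.

import os
from paddle.distributed.fleet.launch import get_gpus

os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
# physical ids 6 and 7 are the 3rd and 4th visible devices,
# so they are rewritten to relative ids [2, 3]
print(get_gpus("6,7"))   # [2, 3]

os.environ["CUDA_VISIBLE_DEVICES"] = ""
print(get_gpus("0,1"))   # ['0', '1'], ids pass through unchanged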
--ips:{} " + "will change into relative_ips:{} according to your " + "CUDA_VISIBLE_DEVICES:{}".format( + gpus, res_gpus, cuda_visible_devices_list)) - return gpus + return res_gpus def launch_collective(args): diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 3da5aed8201ace6ccf9eed1ff322a7c6304de4a6..b6f4c75a276920f966a6b324a9bea16148bf337c 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -227,18 +227,23 @@ def get_logger(log_level=20, name="root"): return logger -def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus): - assert type(paddle_ports) is list, "paddle_ports must be list" +def get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus): + assert type(trainer_endpoints) is list, "trainer_endpoints must be list" cluster = Cluster(hdfs=None) trainer_rank = 0 for node_rank, ip in enumerate(node_ips): pod = Pod() pod.rank = node_rank pod.addr = ip + cur_node_endpoints = trainer_endpoints[node_rank] + # when use paddlecloud, endpoints may > selected_gpus(user_defined) + assert len(cur_node_endpoints) >= len( + selected_gpus + ), "current trainer_endpoints size should be greater equal than selected_gpus size." for i in range(len(selected_gpus)): trainer = Trainer() trainer.gpus.append(selected_gpus[i]) - trainer.endpoint = "%s:%d" % (ip, paddle_ports[i]) + trainer.endpoint = "%s" % (cur_node_endpoints[i]) trainer.rank = trainer_rank trainer_rank += 1 @@ -347,12 +352,13 @@ def pretty_print_envs(envs, header=None): for k, v in envs.items(): max_k = max(max_k, len(k)) - h_format = "{{:^{}s}}{}{{:<{}s}}\n".format(max_k, " " * spacing, max_v) - l_format = "{{:<{}s}}{{}}{{:<{}s}}\n".format(max_k, max_v) + h_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " * spacing, + max_v) + l_format = " " + "|{{:>{}s}}{{}}{{:^{}s}}|\n".format(max_k, max_v) length = max_k + max_v + spacing - border = "".join(["="] * length) - line = "".join(["-"] * length) + border = " +" + "".join(["="] * length) + "+" + line = " +" + "".join(["-"] * length) + "+" draws = "" draws += border + "\n" @@ -423,10 +429,6 @@ def start_local_trainers(cluster, len(pod.trainers), pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) - logger.info( - "More details for debug about commands and environments are written in {}/run.sh". 
@@ -423,10 +429,6 @@ len(pod.trainers), pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) - logger.info( - "More details for debug about commands and environments are written in {}/run.sh". - format(log_dir)) - + fn = None if log_dir is not None: os.system("mkdir -p {}".format(log_dir)) diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index b1952276e44cd1466bc443440505462924115ab7..31a9913701c3e08f5268d578d09c15f5bf8a86f8 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -34,6 +34,9 @@ class AMPOptimizer(MetaOptimizerBase): loss, role_maker, user_defined_optimizer, user_defined_strategy) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.amp: return True return False @@ -42,6 +45,17 @@ dist_strategy.amp = False dist_strategy.amp_configs = {} + def _enable_strategy(self, dist_strategy, context): + dist_strategy.amp = True + dist_strategy.amp_configs = { + "init_loss_scaling": 32768.0, + "incr_every_n_steps": 1000, + "decr_every_n_nan_or_inf": 2, + "incr_ratio": 2.0, + "decr_ratio": 8.0, + "use_dynamic_loss_scaling": True + } + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index f1c6defc5c982c7d56980642898aaa333c199bbe..3f6ed1ed2f23d4595b3aadff6f259f9e27f129b2 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -53,6 +53,9 @@ class DGCOptimizer(MetaOptimizerBase): name=opt._name) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.dgc: if not isinstance(self.inner_opt, Momentum): logging.warn("dgc only works on Momentum optimizer") @@ -69,6 +72,10 @@ dist_strategy.dgc = False dist_strategy.dgc_configs = {} + def _enable_strategy(self, dist_strategy, context): + dist_strategy.dgc = True + dist_strategy.dgc_configs = {"rampup_begin_step": 0, "rampup_step": 1} + def backward(self, loss, startup_program=None, @@ -85,5 +92,5 @@ no_grad_set=None): optimize_ops, params_grads = \ self.dgc_opt.minimize(loss, startup_program, - parameter_list, no_grad_set) + parameter_list, no_grad_set) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py index 7db79ad7b5b7081172209faa2396d9f2a31bbdb3..f1b3680976541806d96ca815be64b03bcd499469 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py @@ -37,14 +37,21 @@ class GradientMergeOptimizer(MetaOptimizerBase): self.user_defined_strategy.gradient_merge_configs["avg"]) def _can_apply(self): + if not self.role_maker._is_collective: + return False + can_apply = (self.user_defined_strategy.gradient_merge == True) and \ - self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1 + self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1 return can_apply def _disable_strategy(self, dist_strategy): dist_strategy.gradient_merge = False dist_strategy.gradient_merge_configs = {} + def _enable_strategy(self, dist_strategy, context): + # we currently do not support auto-enable gradient merge + return + def 
minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index ace31687338f918ef260b3134b0bd429795542d0..6c1cc3d7a9769a5c61997ab761a5458b7e8df4a3 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -48,7 +48,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase): callbacks=None): pass - # should fix the variable + # should fix the variable def _setup_nccl_op(self, startup_program, main_program, build_strategy): trainer_endpoints = self.role_maker.get_trainer_endpoints() trainers = trainer_endpoints @@ -94,31 +94,31 @@ class GraphExecutionOptimizer(MetaOptimizerBase): dist_strategy = self.user_defined_strategy local_build_strategy = paddle.fluid.BuildStrategy() local_build_strategy.enable_sequential_execution = \ - dist_strategy.build_strategy.enable_sequential_execution + dist_strategy.build_strategy.enable_sequential_execution local_build_strategy.fuse_elewise_add_act_ops = \ - dist_strategy.build_strategy.fuse_elewise_add_act_ops + dist_strategy.build_strategy.fuse_elewise_add_act_ops local_build_strategy.fuse_bn_act_ops = \ - dist_strategy.build_strategy.fuse_bn_act_ops + dist_strategy.build_strategy.fuse_bn_act_ops local_build_strategy.enable_auto_fusion = \ - dist_strategy.build_strategy.enable_auto_fusion + dist_strategy.build_strategy.enable_auto_fusion local_build_strategy.fuse_relu_depthwise_conv = \ - dist_strategy.build_strategy.fuse_relu_depthwise_conv + dist_strategy.build_strategy.fuse_relu_depthwise_conv local_build_strategy.fuse_broadcast_ops = \ - dist_strategy.build_strategy.fuse_broadcast_ops + dist_strategy.build_strategy.fuse_broadcast_ops local_build_strategy.fuse_all_optimizer_ops = \ - dist_strategy.build_strategy.fuse_all_optimizer_ops + dist_strategy.build_strategy.fuse_all_optimizer_ops local_build_strategy.enable_inplace = \ - dist_strategy.build_strategy.enable_inplace + dist_strategy.build_strategy.enable_inplace local_build_strategy.use_hierarchical_allreduce = \ - dist_strategy.use_hierarchical_allreduce + dist_strategy.use_hierarchical_allreduce local_build_strategy.hierarchical_allreduce_inter_nranks = \ - dist_strategy.hierarchical_allreduce_inter_nranks + dist_strategy.hierarchical_allreduce_inter_nranks local_build_strategy.sync_batch_norm = \ - dist_strategy.sync_batch_norm + dist_strategy.sync_batch_norm local_build_strategy.fuse_all_reduce_ops = \ - dist_strategy.fuse_all_reduce_ops + dist_strategy.fuse_all_reduce_ops local_build_strategy.nccl_comm_num = \ - dist_strategy.nccl_comm_num + dist_strategy.nccl_comm_num if self.user_defined_strategy.recompute == True: logging.warn( @@ -148,9 +148,6 @@ class GraphExecutionOptimizer(MetaOptimizerBase): sync_allreduce = dist_strategy.sync_nccl_allreduce if sync_allreduce: - paddle.fluid.framework.set_flags({ - "FLAGS_sync_nccl_allreduce": True - }) exe_strategy.num_threads = local_build_strategy.nccl_comm_num + 1 if local_build_strategy.use_hierarchical_allreduce: exe_strategy.num_threads = 2 * local_build_strategy.nccl_comm_num + 1 @@ -191,7 +188,11 @@ class GraphExecutionOptimizer(MetaOptimizerBase): def _disable_strategy(self, dist_strategy): # TODO(guru4elephant): should close all PE related flags here - pass + return + + def _enable_strategy(self, dist_strategy, context): + # by default, graph execution strategy is enabled + 
return def minimize(self, loss, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index 9fa29c4078e9f579a740ef8c0591979e7fbb962d..df9887759e16fddb0579abdcdf3ef5f9024825e7 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ -62,6 +62,9 @@ class LambOptimizer(MetaOptimizerBase): name=opt._name) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.lamb: if not isinstance(self.inner_opt, AdamOptimizer): logging.warn( @@ -75,6 +78,13 @@ class LambOptimizer(MetaOptimizerBase): dist_strategy.lamb = False dist_strategy.lamb_configs = {} + def _enable_strategy(self, dist_strategy, context): + dist_strategy.lamb = True + dist_strategy.lamb_configs = { + "lamb_weight_decay": 0.01, + "exclude_from_weight_decay": [] + } + def backward(self, loss, startup_program=None, @@ -84,6 +94,10 @@ class LambOptimizer(MetaOptimizerBase): return self.lamb_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + # the following function will be used by AMP if both LARS and AMP are turned on together. + def apply_gradients(self, params_grads): + return self.lamb_opt.apply_gradients(params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, @@ -91,5 +105,5 @@ class LambOptimizer(MetaOptimizerBase): no_grad_set=None): optimize_ops, params_grads = \ self.lamb_opt.minimize(loss, startup_program, - parameter_list, no_grad_set) + parameter_list, no_grad_set) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py index a7b856ff5b0dcb1ab30de82a12c91a2e1c14fe76..609d8b85e714c1c7247898f8d506f9dadab9f499 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py @@ -44,13 +44,19 @@ class LarsOptimizer(MetaOptimizerBase): parameter_list=opt._parameter_list, regularization=opt.regularization, grad_clip=opt._grad_clip, - name=opt._name) + name=opt._name, + exclude_from_weight_decay=configs['exclude_from_weight_decay'], + epsilon=configs['epsilon']) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.lars: if not isinstance(self.inner_opt, Momentum): logging.warn( - "lars need the inner optimizer to be Momentum optimizer.") + "lars needs the inner optimizer to be Momentum optimizer but got {}.". + format(self.inner_opt.type)) return False return True return False @@ -59,6 +65,13 @@ class LarsOptimizer(MetaOptimizerBase): dist_strategy.lars = False dist_strategy.lars_configs = {} + def _enable_strategy(self, dist_strategy, context): + dist_strategy.lars = True + dist_strategy.lars_configs = { + "lars_coeff": 0.01, + "lars_weight_decay": 0.0005, + } + def backward(self, loss, startup_program=None, @@ -68,6 +81,10 @@ class LarsOptimizer(MetaOptimizerBase): return self.lars_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + # the following function will be used by AMP if both LARS and AMP are turned on together. + def apply_gradients(self, params_grads): + return self.lars_opt.apply_gradients(params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, @@ -75,5 +92,5 @@ no_grad_set=None): optimize_ops, params_grads = \ self.lars_opt.minimize(loss, startup_program, - parameter_list, no_grad_set) + parameter_list, no_grad_set) return optimize_ops, params_grads
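Both LambOptimizer above and LarsOptimizer below add the same delegation, and the inline comment explains why: when AMP and LARS or LAMB are turned on together, AMP's decorated optimizer invokes apply_gradients() on the wrapped meta optimizer, and without the override the call would fall back to MetaOptimizerBase and apply the plain inner update. A hypothetical flag sketch, not part of this patch:

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.amp = True    # AMP decorates the optimizer and rescales gradients
strategy.lamb = True   # LAMB must still perform the final weight update

# with both enabled, AMP's apply_gradients() path now reaches
# LambOptimizer.apply_gradients -> self.lamb_opt.apply_gradients(...)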
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index e22127c13999bfde7aa753ad1a66536913ab04f9..6fa34d8d28a907d936500907db3e4c65ab4f4da8 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -14,8 +14,8 @@ from __future__ import print_function +import paddle from paddle.fluid import program_guard, layers, default_main_program -from paddle.fluid.optimizer import Momentum, SGD from .meta_optimizer_base import MetaOptimizerBase from .common import OpRole, OP_ROLE_KEY, CollectiveHelper, is_update_op @@ -29,19 +29,28 @@ class LocalSGDOptimizer(MetaOptimizerBase): self.snapshot_key = '@SNAPSHOT' def _can_apply(self): + if not self.role_maker._is_collective: + return False + if not self.user_defined_strategy.localsgd: return False if self.role_maker.worker_num() <= 1: return False - return isinstance(self.inner_opt, Momentum) \ - or isinstance(self.inner_opt, SGD) + return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \ + or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \ + or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \ + or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD) def _disable_strategy(self, dist_strategy): dist_strategy.localsgd = False dist_strategy.localsgd_configs = {} + def _enable_strategy(self, dist_strategy, context): + dist_strategy.localsgd = True + dist_strategy.localsgd_configs = {"k_steps": 1, "begin_step": 1} + def snapshot_name(self, param_name): return param_name + self.snapshot_key @@ -77,8 +86,9 @@ class LocalSGDOptimizer(MetaOptimizerBase): minimized = self.inner_opt.minimize( loss, startup_program=startup_program) - init_k_steps = self.user_defined_strategy.localsgd_configs['k_steps'] - auto_steps = self.user_defined_strategy.auto + k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps'] + begin_step_value = self.user_defined_strategy.localsgd_configs[ + 'begin_step'] if startup_program is None: startup_program = default_startup_program() @@ -92,45 +102,28 @@ class LocalSGDOptimizer(MetaOptimizerBase): p2s = self.create_snapshot_vars(main_block.program) with program_guard(main_block.program, startup_program): - step = layers.autoincreased_step_counter(begin=0) + step = layers.autoincreased_step_counter(begin=1) k_steps = layers.create_global_var( name="k_steps", shape=[1], - value=init_k_steps, + value=k_steps_value, + dtype='int64', + persistable=True) + + begin_step = layers.create_global_var( + name="begin_step", + shape=[1], + value=begin_step_value, dtype='int64', persistable=True) + last_step = layers.create_global_var( name="last_step", shape=[1], - value=int(0), + value=begin_step_value, dtype='int64', persistable=True) - if auto_steps: - avg_loss = layers.collective._c_allreduce( - loss) / self.role_maker.worker_num() - - lr_0 = layers.create_global_var( - name="lr_0", - shape=[1], - value=float(0), - dtype='float32', - persistable=True) - loss_0 = layers.create_global_var( 
name="loss_0", - shape=[1], - value=float(0), - dtype='float32', - persistable=True) - - global_lr = self.inner_opt._global_learning_rate() - - def initialize(): - layers.assign(loss, loss_0) - layers.assign(global_lr, lr_0) - - layers.cond(step == 0, initialize) - def communicate(): sub_block = default_main_program().current_block() ring_id = -1 @@ -186,20 +179,10 @@ class LocalSGDOptimizer(MetaOptimizerBase): inputs={'X': [param]}, outputs={'Out': [snapshot]}, attrs={OP_ROLE_KEY: OpRole.Optimize}) - - if auto_steps: - next_local_steps = layers.cast( - layers.ceil( - layers.sqrt(lr_0 * loss / (global_lr * loss_0) * - float(init_k_steps))), - dtype='int64') - max_local_steps = layers.fill_constant( - shape=[1], dtype='int64', value=16) - next_local_steps = layers.elementwise_min(next_local_steps, - max_local_steps) - layers.assign(next_local_steps, k_steps) layers.assign(step, last_step) - layers.cond(step - last_step == k_steps, communicate) + def begin_localsgd(): + layers.cond(step - last_step == k_steps, communicate) + layers.cond(step > begin_step, begin_localsgd, communicate) return minimized diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py index 073148e11a0a2b08253b89d36d7a014b830518f8..a12ca50442b1c3499d62216d1fecc709f3351382 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py +++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py @@ -48,6 +48,10 @@ class MetaOptimizerBase(Optimizer): raise NotImplementedError("you should implement disable strategy in {}". format(type(self).__name__)) + def _enable_strategy(self, dist_strategy, context=None): + raise NotImplementedError("you should implement enable strategy in {}". 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py index 073148e11a0a2b08253b89d36d7a014b830518f8..a12ca50442b1c3499d62216d1fecc709f3351382 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py +++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py @@ -48,6 +48,10 @@ class MetaOptimizerBase(Optimizer): raise NotImplementedError("you should implement disable strategy in {}". format(type(self).__name__)) + def _enable_strategy(self, dist_strategy, context=None): + raise NotImplementedError("you should implement enable strategy in {}". + format(type(self).__name__)) + def apply_gradients(self, params_grads): return self.inner_opt.apply_gradients(params_grads=params_grads) diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py index 878ed7422d733d3e2828e0395ec63ed16b4c489a..7dc532c86ea681d8479710732ec33e96c58c35d5 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py @@ -24,6 +24,9 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer): self.meta_optimizers_white_list = [] def _can_apply(self): + if self.role_maker._is_collective: + return False + k_steps = self.user_defined_strategy.a_sync_configs["k_steps"] if k_steps < 0: return False @@ -37,7 +40,11 @@ return True def _disable_strategy(self, dist_strategy): - dist_strategy.a_sync_configs = {} + return + + def _enable_strategy(self, dist_strategy, context): + # only open up the async mode for auto-parallel + return def _is_graph_out(self): return True diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index ecb198bedf9041aa3ffc929a72cce3c209f03b61..51d4d343165b9057c803a22aa428081109d7d35f 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -13,6 +13,10 @@ from paddle import fluid from .meta_optimizer_base import MetaOptimizerBase +from paddle.fluid import core +import subprocess +import re +import platform class ParameterServerOptimizer(MetaOptimizerBase): @@ -28,6 +32,7 @@ def _can_apply(self): if self.role_maker._is_collective: return False + k_steps = self.user_defined_strategy.a_sync_configs["k_steps"] return True if k_steps >= 0 else False @@ -127,6 +132,95 @@ return _main, _startup + def _can_apply_geo(self, dist_strategy, program): + def get_sys_free_mem(): + plat = platform.system() + if platform.system() == "Darwin": + vm = subprocess.Popen( + ['vm_stat'], stdout=subprocess.PIPE).communicate()[0].decode() + # Process vm_stat + vmLines = vm.split('\n') + sep = re.compile(':[\s]+') + vmStats = {} + for row in range(1, len(vmLines) - 2): + rowText = vmLines[row].strip() + rowElements = sep.split(rowText) + vmStats[(rowElements[0] + )] = int(rowElements[1].strip('\.')) * 4096 + return vmStats["Pages free"] + elif platform.system() == "Linux": + mems = {} + with open('/proc/meminfo', 'rb') as f: + for line in f: + fields = line.split() + mems[fields[0]] = int(fields[1]) * 1024 + free = mems[b'MemFree:'] + return free + else: + raise ValueError( + "%s platform is unsupported in parameter server optimizer" % + (platform.system())) + + if not isinstance(self.inner_opt, fluid.optimizer.SGDOptimizer): + return False + + free = get_sys_free_mem() + + from paddle.fluid.incubate.fleet.parameter_server.ir import vars_metatools + + processed_var_names = set(["@EMPTY@"]) + param_memory_size = 0 + for varname in program.global_block().vars: + var = program.global_block().vars[varname] + if not var.persistable or var.desc.type( + ) != core.VarDesc.VarType.LOD_TENSOR: + continue + param = 
vars_metatools.create_var_struct(var) + param_memory_size += param.m_size + processed_var_names.add(varname) + + upper_mem_use = param_memory_size * 5.0 + + program_tmp_vars = dict() + eval_batch_size = 1024 + for op in program.global_block().ops: + for var_name in op.output_arg_names: + if var_name in processed_var_names: + continue + processed_var_names.add(var_name) + var = program.global_block().vars[var_name] + + if var.desc.type() != core.VarDesc.VarType.LOD_TENSOR: + continue + + data_count = 1 + neg_dim_count = 0 + for x in var.shape: + if x < 0: + if neg_dim_count >= 1: + raise ValueError( + "Var %s has more than one negative dim." % + (var_name)) + neg_dim_count += 1 + data_count *= (-x) + else: + data_count *= x + program_tmp_vars[var_name] = ( + data_count, neg_dim_count, + vars_metatools.dtype_to_size[var.dtype]) + + for varname in program_tmp_vars: + data_count, neg_dim_count, type_size = program_tmp_vars[varname] + if neg_dim_count == 1: + data_count *= eval_batch_size + var_memory = data_count * type_size + upper_mem_use += var_memory + + if upper_mem_use < free: + return True + else: + return False + def minimize_impl(self, loss, startup_program=None, @@ -143,6 +237,7 @@ compiled_config = public.CompileTimeStrategy(_origin_main_program, _origin_startup_program, strategy, self.role_maker) + compiled_config.strategy = strategy if self.role_maker.is_worker() or self.role_maker._is_heter_worker(): main_program, startup_program = self._build_trainer_programs( @@ -157,4 +252,24 @@ return None, None def _disable_strategy(self, dist_strategy): - self.user_defined_strategy.a_sync_configs = {} + dist_strategy.a_sync = False + a_sync_configs = dist_strategy.a_sync_configs + a_sync_configs["k_steps"] = -1 + dist_strategy.a_sync_configs = a_sync_configs + + def _enable_strategy(self, dist_strategy, context): + a_sync_configs = dist_strategy.a_sync_configs + if a_sync_configs["k_steps"] >= 0: + return + + dist_strategy.a_sync = True + a_sync_configs = dist_strategy.a_sync_configs + + is_geo = self._can_apply_geo(dist_strategy, + context["origin_main_program"]) + + if is_geo: + a_sync_configs["k_steps"] = 800 + else: + a_sync_configs["k_steps"] = 0 + dist_strategy.a_sync_configs = a_sync_configs
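To make the geo heuristic concrete, a worked sketch of _can_apply_geo's arithmetic with hypothetical numbers: parameter memory is charged five times over, every unknown-batch temporary is priced at eval_batch_size instances, and geo mode is chosen only if the bound fits in free memory.

# hypothetical model: 100 MB of persistable parameters
param_memory_size = 100 * 1024 * 1024
upper_mem_use = param_memory_size * 5.0       # 500 MB for parameter copies

# one temporary LOD_TENSOR of shape [-1, 1024] in float32 (4 bytes);
# the -1 dim is priced at eval_batch_size = 1024 instances
eval_batch_size = 1024
upper_mem_use += eval_batch_size * 1024 * 4   # + 4 MB activation bound

free = 8 * 1024 * 1024 * 1024                 # suppose 8 GB reported free
use_geo = upper_mem_use < free                # True -> k_steps set to 800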
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index d5a45e2b4e1aeda2e1c66c0a5a36236622f093ec..87fa70779111ea485319f50b58901c605fffa23c 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -103,6 +103,9 @@ class PipelineOptimizer(MetaOptimizerBase): self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.pipeline == True: return True return False @@ -111,6 +114,10 @@ dist_strategy.pipeline = False dist_strategy.pipeline_configs = {} + def _enable_strategy(self, dist_strategy, context): + # we do not support enable pipeline automatically right now + return + def minimize_impl(self, loss, startup_program=None, @@ -176,7 +183,7 @@ grad = None for idx, op in reversed(list(enumerate(block.ops))): if is_backward_op(op) and \ - OP_ROLE_VAR_KEY in op.attr_names: + OP_ROLE_VAR_KEY in op.attr_names: op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] if len(op_role_var) == 0: continue diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py index 3eb3ca6127cfe0d0a7a458c6c44e09ce22e7b24a..59ca7e633099e8688a57fa9024575e29008c0341 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py @@ -38,6 +38,9 @@ class RecomputeOptimizer(MetaOptimizerBase): list(user_defined_strategy.recompute_configs["checkpoints"])) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.recompute == True: if len(self.user_defined_strategy.recompute_configs[ "checkpoints"]) == 0: @@ -49,6 +52,10 @@ dist_strategy.recompute = False dist_strategy.recompute_configs = {} + def _enable_strategy(self, dist_strategy, context): + # we do not support automatically recompute checkpoints currently + return + def backward(self, loss, startup_program=None, diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index e2ab321f9aebddd437c92ded9e6005495f760096..9b969cf3002379058b9cff0d604d2db750573028 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -160,18 +160,21 @@ def get_cluster_from_args(args, selected_gpus): x for x in range(started_port, started_port + len(selected_gpus)) ] - return get_cluster(node_ips, node_ip, free_ports, selected_gpus) + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) + return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus) def get_gpus(selected_gpus): if selected_gpus is None: from paddle.fluid import core gpus_num = core.get_cuda_device_count() - selected_gpus = [str(x) for x in range(0, gpus_num)] + gpus = [str(x) for x in range(0, gpus_num)] else: cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") if cuda_visible_devices is None or cuda_visible_devices == "": - selected_gpus = [x.strip() for x in selected_gpus.split(',')] + gpus = [x.strip() for x in selected_gpus.split(',')] else: # change selected_gpus into relative values # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7; @@ -181,12 +184,16 @@ def get_gpus(selected_gpus): assert x in cuda_visible_devices_list, "Can't find "\ "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\ % (x, cuda_visible_devices) - selected_gpus = [ + gpus = [ cuda_visible_devices_list.index(x.strip()) for x in selected_gpus.split(',') ] + logger.info("Change selected_gpus into relative values. 
--ips:{} " + "will change into relative_ips:{} according to your " + "CUDA_VISIBLE_DEVICES:{}".format( + selected_gpus, gpus, cuda_visible_devices_list)) - return selected_gpus + return gpus def get_cluster_and_pod(args): diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 1fa307c4d1b89d4033a8f8346b254177053e9dc0..be144a55b86200042f4d03b112071a374612b3a5 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -227,18 +227,23 @@ def get_logger(log_level, name="root"): return logger -def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus): - assert type(paddle_ports) is list, "paddle_ports must be list" +def get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus): + assert type(trainer_endpoints) is list, "trainer_endpoints must be list" cluster = Cluster(hdfs=None) trainer_rank = 0 for node_rank, ip in enumerate(node_ips): pod = Pod() pod.rank = node_rank pod.addr = ip + cur_node_endpoints = trainer_endpoints[node_rank] + # when use paddlecloud, endpoints may > selected_gpus(user_defined) + assert len(cur_node_endpoints) >= len( + selected_gpus + ), "current trainer_endpoints size should be greater equal than selected_gpus size." for i in range(len(selected_gpus)): trainer = Trainer() trainer.gpus.append(selected_gpus[i]) - trainer.endpoint = "%s:%d" % (ip, paddle_ports[i]) + trainer.endpoint = "%s" % (cur_node_endpoints[i]) trainer.rank = trainer_rank trainer_rank += 1 @@ -253,7 +258,8 @@ def terminate_local_procs(procs): for p in procs: if p.proc.poll() is None: p.proc.terminate() - p.log_fn.close() + if p.log_fn: + p.log_fn.close() logger.debug("terminate process id:{}".format(p.proc.pid)) #wait all process terminiated diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index 918ebce07825139fabe8ddd4c1e266dd04eb7f6d..35204affb3fd168b8bd137d78c3413a08885e2bb 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -138,7 +138,7 @@ class Distribution(object): convert value's dtype to be consistent with param's dtype. Args: - param (int|float|list|numpy.ndarray|Tensor): low and high in Uniform class, loc and scale in Normal class. + param (Tensor): low and high in Uniform class, loc and scale in Normal class. value (Tensor): The input tensor. 
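With this change get_cluster receives one endpoint list per node rather than a flat port list. A small sketch of the layout it now expects, assuming the free ports were already chosen:

def build_trainer_endpoints(node_ips, ports):
    # outer index = node rank, inner index = local trainer rank
    return [["%s:%d" % (ip, port) for port in ports] for ip in node_ips]

eps = build_trainer_endpoints(["10.0.0.1", "10.0.0.2"], [6170, 6171])
assert eps[1][0] == "10.0.0.2:6170"  # node rank 1, first trainer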
Returns: @@ -152,6 +152,7 @@ class Distribution(object): ) return core.ops.cast(value, 'in_dtype', value.dtype, 'out_dtype', param.dtype) + return value check_variable_and_dtype(value, 'value', ['float32', 'float64'], 'log_prob') diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 898c7d295641863740288e3f4e1da39266bce183..d51cacd1a5cad53ef77b325e5380100c537e057e 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -1756,6 +1756,12 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): op_path_dict = dict() op_path = _find_op_path_(block, targets, inputs, block_no_grad_set, op_path_dict) + + # find no grad var by op_path + no_grad_vars = _find_no_grad_vars(block, op_path, targets, + block_no_grad_set) + block_no_grad_set.update(no_grad_vars) + no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) grad_to_var = dict() grad_info_map = dict() diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 0e187d4174cd5cca65f79e4ab84b4cc32ecefd21..7b564b3f837c001673bdd272ba60edf31cde21fb 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -37,7 +37,7 @@ import warnings import inspect import numpy as np - +import paddle from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers import utils from ... import unique_name @@ -56,7 +56,8 @@ __all__ = [ 'match_matrix_tensor', 'tree_conv', 'fused_embedding_seq_pool', 'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat', 'sparse_embedding', 'partial_sum', 'tdm_child', 'rank_attention', - 'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice' + 'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice', + 'correlation' ] @@ -1546,3 +1547,81 @@ def bilateral_slice(x, guide, grid, has_offset, name=None): attrs={'has_offset': has_offset}, outputs={'Out': out}) return out + + +def correlation(x, + y, + pad_size, + kernel_size, + max_displacement, + stride1, + stride2, + corr_type_multiply=1): + """ + + This operation compute correlation of two tensor. + For more information of correlation, please refer to PWC-Net: + CNNs for Optical Flow Using Pyramid, Warping, and Cost Volume + _ + + Args: + x(Tensor): The input x is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64. + y(Tensor): The input y is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64. + pad_size(int): Pad size. The data type is int. + max_displacement(int): Max displacement. The data type is int. + stride1(int): stride size of x. The data type is int. + stride2(int): stride size of y. The data type is int. + corr_type_multiply(int, optional): The type of multiply. The data type is int. Default: 1. + + Returns: + Tensor: The data type is same as input tensor. + + Examples: + + .. 
code-block:: python + + import paddle.fluid as fluid + + x1 = fluid.layers.data(name='x1', + shape=x_shape, + dtype=x_type, + append_batch_size=False) + x2 = fluid.layers.data(name='x2', + shape=x_shape, + dtype=x_type, + append_batch_size=False) + + + out = fluid.contrib.correlation( + x1, + x2, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + + """ + + helper = LayerHelper("correlation", **locals()) + output = helper.create_variable_for_type_inference(dtype=x.dtype) + if paddle.fluid.in_dygraph_mode(): + attrs = ("pad_size", pad_size, "kernel_size", kernel_size, + "max_displacement", max_displacement, "stride1", stride1, + "stride2", stride2, "corr_type_multiply", corr_type_multiply) + output = getattr(core.ops, "correlation")(x, y, *attrs) + else: + helper.append_op( + type="correlation", + inputs={"Input1": x, + "Input2": y}, + attrs={ + "pad_size": pad_size, + "kernel_size": kernel_size, + "max_displacement": max_displacement, + "stride1": stride1, + "stride2": stride2, + "corr_type_multiply": corr_type_multiply + }, + outputs={"Output": output}) + return output diff --git a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py new file mode 100644 index 0000000000000000000000000000000000000000..d4dc968ca0de44b01741bf1f1fbaac7a9a65287e --- /dev/null +++ b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py @@ -0,0 +1,124 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.data_feeder import check_variable_and_dtype, check_type +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import Variable + +__all__ = ['check_finite_and_unscale', 'update_loss_scaling'] + + +def check_finite_and_unscale(x, scale, name=None): + """ + Check if input X contains all finite data, if yes, scale it by input Scale. + + $$Out = X / scale$$ + + If any tensor in X contains Inf or Nan, the Out will generate a indicator. + FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of + Out should not be used, and its data may not be deterministic. + Otherwise, FoundInfinite will be 0 (False). + Args: + x(list|tuple): The input tensors of check_finite_and_unscale operator. + scale: The scale of check_finite_and_unscale operator. 
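A plain-numpy reference for the check_finite_and_unscale semantics documented above, useful as a mental model only (the real op writes Out in place and returns FoundInfinite as a tensor):

import numpy as np

def check_finite_and_unscale_ref(xs, scale):
    found_inf = any(not np.all(np.isfinite(x)) for x in xs)
    if not found_inf:
        xs = [x / scale for x in xs]
    return xs, found_inf

grads, inf = check_finite_and_unscale_ref([np.array([2.0, 4.0])], 2.0)
assert not inf and grads[0].tolist() == [1.0, 2.0]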
+ """ + check_type(x, 'x', (tuple, list), 'check_finite_and_unscale') + for e in x: + check_variable_and_dtype(e, "x", ['float32', 'float64'], + 'check_finite_and_unscale') + + helper = LayerHelper("check_finite_and_unscale", **locals()) + found_inf = helper.create_variable_for_type_inference(dtype='bool') + + inputs = {'X': x, 'Scale': scale} + outputs = {'Out': x, 'FoundInfinite': found_inf} + helper.append_op( + type='check_finite_and_unscale', inputs=inputs, outputs=outputs) + + return x, found_inf + + +def update_loss_scaling(x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name=None): + """ + Update loss scaling according to overall gradients. If all gradients is + finite after incr_every_n_steps, loss scaling will increase by incr_ratio. + Otherwise, loss scaling will decrease by decr_ratio after + decr_every_n_nan_or_inf steps and each step some gradients are infinite. + + Args: + x(list|tuple): The input tensors of update_loss_scaling operator. + found_inf (Variable): A boolean variable indicates whether + there is any infinite gradient. + prev_loss_scaling (Variable): Previous loss scaling. + num_good_steps (Variable): A variable accumulates good steps in which + all gradients are finite. + num_bad_steps (Variable): A variable accumulates bad steps in which + some gradients are infinite. + incr_every_n_steps (int): A variable represents increasing loss + scaling every n consecutive steps with + finite gradients. + decr_every_n_nan_or_inf (int): A variable represents decreasing + loss scaling every n accumulated + steps with nan or inf gradients. + incr_ratio(float): The multiplier to use when increasing the loss + scaling. + decr_ratio(float): The less-than-one-multiplier to use when decreasing + loss scaling. + """ + + check_variable_and_dtype(prev_loss_scaling, "prev_loss_scaling", + ['float32', 'float64'], "update_loss_scaling") + check_type(x, 'x', (tuple, list), 'update_loss_scaling') + for e in x: + check_variable_and_dtype(e, "x", ['float32', 'float64'], + 'update_loss_scaling') + assert prev_loss_scaling.dtype == e.dtype, "The dtype of prev_loss_scaling should be equal to the dtype of x." + + helper = LayerHelper("update_loss_scaling", **locals()) + + inputs = { + 'X': x, + 'FoundInfinite': found_inf, + 'PrevLossScaling': prev_loss_scaling, + 'InGoodSteps': num_good_steps, + 'InBadSteps': num_bad_steps + } + + outputs = { + 'Out': x, + 'LossScaling': prev_loss_scaling, + 'OutGoodSteps': num_good_steps, + 'OutBadSteps': num_bad_steps + } + + attrs = { + 'incr_every_n_steps': incr_every_n_steps, + 'decr_every_n_nan_or_inf': decr_every_n_nan_or_inf, + 'incr_ratio': incr_ratio, + 'decr_ratio': decr_ratio, + } + + helper.append_op( + type='update_loss_scaling', inputs=inputs, outputs=outputs, attrs=attrs) + + return x diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index bfbd2700ae10bac4ad37462b5d7844b90dd05bbe..c9112ac849ce0506b7afd941b2213710e06bd1c6 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -17,9 +17,11 @@ from ... import default_startup_program from ... import layers from ... import unique_name from . 
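The dynamic loss-scaling rule wrapped by update_loss_scaling, restated as scalar Python. The sketch assumes the same clamp-at-1.0 on decrease as the Switch-based implementation this op replaces (removed from fp16_utils.py further below):

def update_loss_scaling_ref(found_inf, scaling, good, bad,
                            incr_every_n_steps, decr_every_n_nan_or_inf,
                            incr_ratio, decr_ratio):
    if found_inf:
        good, bad = 0, bad + 1
        if bad == decr_every_n_nan_or_inf:
            scaling, bad = max(scaling * decr_ratio, 1.0), 0
    else:
        good, bad = good + 1, 0
        if good == incr_every_n_steps:
            scaling, good = scaling * incr_ratio, 0
    return scaling, good, bad

# two overflows in a row with decr_every_n_nan_or_inf=2 halve the scale
s, g, b = 2048.0, 0, 0
for _ in range(2):
    s, g, b = update_loss_scaling_ref(True, s, g, b, 1000, 2, 2.0, 0.5)
assert (s, g, b) == (1024.0, 0, 0)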
import fp16_utils -from .fp16_utils import update_loss_scaling, rewrite_program +from .fp16_utils import rewrite_program from .fp16_utils import update_role_var_grad from .fp16_lists import AutoMixedPrecisionLists +from .amp_nn import check_finite_and_unscale +from .amp_nn import update_loss_scaling __all__ = ["decorate"] @@ -67,10 +69,8 @@ class OptimizerWithMixedPrecision(object): persistable=True) self._use_dynamic_loss_scaling = use_dynamic_loss_scaling if self._use_dynamic_loss_scaling: - self._incr_every_n_steps = layers.fill_constant( - shape=[1], dtype='int32', value=incr_every_n_steps) - self._decr_every_n_nan_or_inf = layers.fill_constant( - shape=[1], dtype='int32', value=decr_every_n_nan_or_inf) + self._incr_every_n_steps = incr_every_n_steps + self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf self._incr_ratio = incr_ratio self._decr_ratio = decr_ratio self._num_good_steps = layers.create_global_var( @@ -139,49 +139,46 @@ class OptimizerWithMixedPrecision(object): # Change the op_role_var attr for some ops, so that gradients # transferred across GPUs can be FP16. update_role_var_grad(self._train_program, self._params_grads) - scaled_params_grads = [] - for p, g in self._params_grads: - with self._train_program._optimized_guard([p, g]): - scaled_g = g / self._loss_scaling - scaled_params_grads.append([p, scaled_g]) - return scaled_params_grads + return self._params_grads - def apply_gradients(self, scaled_params_grads): + def apply_gradients(self, params_grads): """ Check scaled gradients to determine whether to update loss scaling and update parameters by their scaled gradients, Args: - scaled_params_grads (list): A list of params and scaled grads. + params_grads (list): A list of params and scaled grads. Returns: A list of optimize operators. """ - if self._use_dynamic_loss_scaling: + grads = [g for _, g in params_grads] + with self._train_program._optimized_guard(grads): + grads, found_inf = check_finite_and_unscale( + grads, self._loss_scaling, name="find_infinite_scale") - grads = [layers.reduce_sum(g) for [_, g] in scaled_params_grads] - all_grads = layers.concat(grads) - all_grads_sum = layers.reduce_sum(all_grads) - is_overall_finite = layers.isfinite(all_grads_sum) - - update_loss_scaling(is_overall_finite, self._loss_scaling, - self._num_good_steps, self._num_bad_steps, - self._incr_every_n_steps, - self._decr_every_n_nan_or_inf, self._incr_ratio, - self._decr_ratio) - - # apply_gradient append all ops in global block, thus we shouldn't - # apply gradient in the switch branch. - with layers.Switch() as switch: - with switch.case(is_overall_finite): - pass - with switch.default(): - for _, g in scaled_params_grads: - layers.assign(layers.zeros_like(g), g) - - optimize_ops = self._optimizer.apply_gradients(scaled_params_grads) + if self._use_dynamic_loss_scaling: + with self._train_program._optimized_guard(grads): + grads = update_loss_scaling( + grads, + found_inf, + self._loss_scaling, + self._num_good_steps, + self._num_bad_steps, + self._incr_every_n_steps, + self._decr_every_n_nan_or_inf, + self._incr_ratio, + self._decr_ratio, + name="update_loss_scaling") + + params_unscaled_grads = [] + for pg, new_g in zip(params_grads, grads): + params_unscaled_grads.append((pg[0], new_g)) + # apply_gradient append all ops in global block, thus we shouldn't + # apply gradient in the switch branch. 
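With the refactor above, AMP's apply_gradients becomes a straight pipeline: unscale all gradients at once, optionally update the loss scale, then hand (param, unscaled_grad) pairs to the inner optimizer. The control flow over plain values, with stand-in callables for the three steps:

def amp_apply_gradients(params_grads, unscale, update_scaling, apply):
    grads = [g for _, g in params_grads]
    grads, found_inf = unscale(grads)         # check_finite_and_unscale
    grads = update_scaling(grads, found_inf)  # update_loss_scaling, optional
    return apply([(p, g) for (p, _), g in zip(params_grads, grads)])

ops = amp_apply_gradients([("w", 2.0)],
                          lambda gs: ([g / 2.0 for g in gs], False),
                          lambda gs, inf: gs,
                          lambda pgs: pgs)
assert ops == [("w", 1.0)]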
+ optimize_ops = self._optimizer.apply_gradients(params_unscaled_grads) return optimize_ops diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 328dafe6219adb3c6355de0bafc430c52725024f..0b142ff33de55f36410eb9c23cb75210fc9d6321 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -328,77 +328,3 @@ def update_role_var_grad(main_prog, params_grads): raise ValueError("The op {0} is not in program".format(op)) block.desc._remove_op(op_idx, op_idx + 1) block._sync_with_cpp() - - -def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps, - num_bad_steps, incr_every_n_steps, - decr_every_n_nan_or_inf, incr_ratio, decr_ratio): - """ - Update loss scaling according to overall gradients. If all gradients is - finite after incr_every_n_steps, loss scaling will increase by incr_ratio. - Otherwise, loss scaling will decrease by decr_ratio after - decr_every_n_nan_or_inf steps and each step some gradients are infinite. - - Args: - is_overall_finite (Variable): A boolean variable indicates whether - all gradients are finite. - prev_loss_scaling (Variable): Previous loss scaling. - num_good_steps (Variable): A variable accumulates good steps in which - all gradients are finite. - num_bad_steps (Variable): A variable accumulates bad steps in which - some gradients are infinite. - incr_every_n_steps (Variable): A variable represents increasing loss - scaling every n consecutive steps with - finite gradients. - decr_every_n_nan_or_inf (Variable): A variable represents decreasing - loss scaling every n accumulated - steps with nan or inf gradients. - incr_ratio(float): The multiplier to use when increasing the loss - scaling. - decr_ratio(float): The less-than-one-multiplier to use when decreasing - loss scaling. 
- """ - zero_steps = layers.fill_constant(shape=[1], dtype='int32', value=0) - with layers.Switch() as switch: - with switch.case(is_overall_finite): - should_incr_loss_scaling = layers.less_than(incr_every_n_steps, - num_good_steps + 1) - with layers.Switch() as switch1: - with switch1.case(should_incr_loss_scaling): - new_loss_scaling = prev_loss_scaling * incr_ratio - loss_scaling_is_finite = layers.isfinite(new_loss_scaling) - with layers.Switch() as switch2: - with switch2.case(loss_scaling_is_finite): - layers.assign(new_loss_scaling, prev_loss_scaling) - with switch2.default(): - pass - layers.assign(zero_steps, num_good_steps) - layers.assign(zero_steps, num_bad_steps) - - with switch1.default(): - layers.increment(num_good_steps) - layers.assign(zero_steps, num_bad_steps) - - with switch.default(): - should_decr_loss_scaling = layers.less_than(decr_every_n_nan_or_inf, - num_bad_steps + 1) - with layers.Switch() as switch3: - with switch3.case(should_decr_loss_scaling): - new_loss_scaling = prev_loss_scaling * decr_ratio - static_loss_scaling = \ - layers.fill_constant(shape=[1], - dtype='float32', - value=1.0) - less_than_one = layers.less_than(new_loss_scaling, - static_loss_scaling) - with layers.Switch() as switch4: - with switch4.case(less_than_one): - layers.assign(static_loss_scaling, - prev_loss_scaling) - with switch4.default(): - layers.assign(new_loss_scaling, prev_loss_scaling) - layers.assign(zero_steps, num_good_steps) - layers.assign(zero_steps, num_bad_steps) - with switch3.default(): - layers.assign(zero_steps, num_good_steps) - layers.increment(num_bad_steps) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 5662284483bf529034e42178c8a431f6286e31b8..8d399c929018f08eb3d02e50981566705536bbf5 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -192,7 +192,6 @@ class ImperativeQuantAware(object): assert len(input_dtype) == len( feed), "The length of input_shape should be equal to feed's." 
- prog_trans = dygraph.ProgramTranslator() with dygraph.guard(): model.eval() input_vars = [] diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index 59dd9867abb95dea74e1cdc362b671e7d4120d70..e22c980b0a7c6030c5d6a2fbc4fd58d2ec66958a 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -209,15 +209,24 @@ class FakeQuantAbsMax(layers.Layer): return quant_out -def _get_fake_quant_type(quant_type, name, moving_rate, quant_bits, dtype, - quant_on_weight): +def _get_fake_quant_type(quant_type, **kwargs): + call_args = { + "name": kwargs.get("name", None), + "quant_bits": kwargs.get("quant_bits", 8), + "dtype": kwargs.get("dtype", "float32") + } + + if quant_type == 'abs_max': + call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False) + elif quant_type == 'moving_average_abs_max': + call_args["moving_rate"] = kwargs.get("moving_rate", 0.9) + fake_quant_map = { - 'abs_max': - lambda: FakeQuantAbsMax(name, quant_bits, dtype, quant_on_weight), - 'moving_average_abs_max': - lambda: FakeQuantMovingAverage(name, moving_rate, quant_bits, dtype) + 'abs_max': FakeQuantAbsMax, + 'moving_average_abs_max': FakeQuantMovingAverage } - return fake_quant_map[quant_type]() + + return fake_quant_map[quant_type](**call_args) class QuantizedConv2D(layers.Layer): @@ -247,11 +256,18 @@ class QuantizedConv2D(layers.Layer): self.bias = getattr(layer, 'bias') # For FakeQuant self._fake_quant_weight = _get_fake_quant_type( - weight_quantize_type, self.weight.name, moving_rate, weight_bits, - self._dtype, True) + weight_quantize_type, + name=self.weight.name, + moving_rate=moving_rate, + quant_bits=weight_bits, + dtype=self._dtype, + quant_on_weight=True) self._fake_quant_input = _get_fake_quant_type( activation_quantize_type, - layer.full_name(), moving_rate, activation_bits, self._dtype, False) + name=layer.full_name(), + moving_rate=moving_rate, + quant_bits=activation_bits, + dtype=self._dtype) def forward(self, input): quant_input = self._fake_quant_input(input) @@ -326,11 +342,18 @@ class QuantizedLinear(layers.Layer): self.bias = getattr(layer, 'bias') # For FakeQuant self._fake_quant_weight = _get_fake_quant_type( - weight_quantize_type, self.weight.name, moving_rate, weight_bits, - self._dtype, True) + weight_quantize_type, + name=self.weight.name, + moving_rate=moving_rate, + quant_bits=weight_bits, + dtype=self._dtype, + quant_on_weight=True) self._fake_quant_input = _get_fake_quant_type( activation_quantize_type, - layer.full_name(), moving_rate, activation_bits, self._dtype, False) + name=layer.full_name(), + moving_rate=moving_rate, + quant_bits=activation_bits, + dtype=self._dtype) def forward(self, input): quant_input = self._fake_quant_input(input) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 244a621611060b87805846f1ea748615bcdde19a..ddbd99e16cebdfc839a8e96e44d4f96f02e70c55 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -143,7 +143,7 @@ class PostTrainingQuantization(object): weight_quantize_type='channel_wise_abs_max', optimize_model=False, is_use_cache_file=False, - cache_dir="./temp_post_training"): + cache_dir=None): ''' Constructor. 
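_get_fake_quant_type above now assembles the per-type constructor arguments and dispatches through a plain class map instead of per-type lambdas. The same pattern in a self-contained sketch; the two stand-in classes are hypothetical and only the dispatch shape mirrors the patch:

class AbsMax(object):
    def __init__(self, name, quant_bits, dtype, quant_on_weight):
        self.quant_on_weight = quant_on_weight

class MovingAverageAbsMax(object):
    def __init__(self, name, quant_bits, dtype, moving_rate):
        self.moving_rate = moving_rate

def get_fake_quant_type(quant_type, **kwargs):
    call_args = {
        "name": kwargs.get("name", None),
        "quant_bits": kwargs.get("quant_bits", 8),
        "dtype": kwargs.get("dtype", "float32"),
    }
    if quant_type == 'abs_max':
        call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False)
    elif quant_type == 'moving_average_abs_max':
        call_args["moving_rate"] = kwargs.get("moving_rate", 0.9)
    fake_quant_map = {
        'abs_max': AbsMax,
        'moving_average_abs_max': MovingAverageAbsMax,
    }
    return fake_quant_map[quant_type](**call_args)

assert get_fake_quant_type('abs_max', name="w", quant_on_weight=True).quant_on_weight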
@@ -206,13 +206,8 @@ class PostTrainingQuantization(object): `conv2d/depthwise_conv2d + bn`, the weights scale for all channel will be different. In address this problem, fuse the pattern before quantization. Default False. - is_use_cache_file(bool, optional): If set is_use_cache_file as False, - all temp data will be saved in memory. If set is_use_cache_file as True, - it will save temp data to disk. When the fp32 model is complex or - the number of calibrate data is large, we should set is_use_cache_file - as True. Defalut is False. - cache_dir(str, optional): When is_use_cache_file is True, set cache_dir as - the directory for saving temp data. Default is ./temp_post_training. + is_use_cache_file(bool, optional): This param is deprecated. + cache_dir(str, optional): This param is deprecated. Returns: None @@ -302,10 +297,6 @@ class PostTrainingQuantization(object): assert op_type in self._support_quantize_op_type, \ op_type + " is not supported for quantization." self._optimize_model = optimize_model - self._is_use_cache_file = is_use_cache_file - self._cache_dir = cache_dir - if self._is_use_cache_file and not os.path.exists(self._cache_dir): - os.mkdir(self._cache_dir) # Define variables self._place = self._executor.place @@ -317,11 +308,17 @@ class PostTrainingQuantization(object): self._out_scale_op_list = _out_scale_op_list self._quantized_weight_var_name = set() self._quantized_act_var_name = set() - self.weight_op_pairs = {} + self._weight_op_pairs = {} + # The vars for alog = KL + self._sampling_act_abs_min_max = {} + self._sampling_act_histogram = {} self._sampling_data = {} self._quantized_var_kl_threshold = {} + self._histogram_bins = 2048 + # The vars for algo = min_max self._quantized_var_min = {} self._quantized_var_max = {} + # The vars for algo = abs_max self._quantized_var_abs_max = {} def quantize(self): @@ -339,6 +336,25 @@ class PostTrainingQuantization(object): self._collect_target_varnames() self._set_activation_persistable() + if self._algo == "KL": + _logger.info("Preparation stage ...") + batch_id = 0 + for data in self._data_loader(): + self._executor.run(program=self._program, + feed=data, + fetch_list=self._fetch_list, + return_numpy=False, + scope=self._scope) + self._collect_activation_abs_min_max() + if batch_id % 5 == 0: + _logger.info("Run batch: " + str(batch_id)) + batch_id += 1 + if self._batch_nums and batch_id >= self._batch_nums: + break + _logger.info("Finish preparation stage, all batch:" + str(batch_id)) + self._init_sampling_act_histogram() + + _logger.info("Sampling stage ...") batch_id = 0 for data in self._data_loader(): self._executor.run(program=self._program, @@ -346,17 +362,13 @@ class PostTrainingQuantization(object): fetch_list=self._fetch_list, return_numpy=False, scope=self._scope) - if self._algo == "KL": - self._sample_data(batch_id) - else: - self._sample_threshold() - + self._sampling() if batch_id % 5 == 0: _logger.info("Run batch: " + str(batch_id)) batch_id += 1 if self._batch_nums and batch_id >= self._batch_nums: break - _logger.info("Finish all batch: " + str(batch_id)) + _logger.info("Finish sampling stage, all batch: " + str(batch_id)) self._reset_activation_persistable() @@ -397,6 +409,7 @@ class PostTrainingQuantization(object): target_vars=self._fetch_list, executor=self._executor, main_program=self._program) + _logger.info("The quantized model is saved in " + save_model_path) def _load_model_data(self): ''' @@ -454,7 +467,7 @@ class PostTrainingQuantization(object): for var_name in var_name_list: if var_name in 
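For algo="KL", quantize() above now runs two passes instead of caching raw activation tensors: a preparation pass that finds each activation's abs range, then a sampling pass that accumulates a fixed 2048-bin histogram on that range. The numpy core of the scheme, sketched on random data:

import numpy as np

batches = [np.random.randn(32, 16) for _ in range(4)]
histogram_bins = 2048

# preparation pass: abs min/max across all batches
abs_min = min(float(np.min(np.abs(b))) for b in batches)
abs_max = max(float(np.max(np.abs(b))) for b in batches)

# init an empty histogram on that range, then accumulate batch by batch
hist, edges = np.histogram([], bins=histogram_bins, range=(abs_min, abs_max))
for b in batches:
    h, _ = np.histogram(np.abs(b), bins=edges)
    hist = hist + h

assert hist.sum() == sum(b.size for b in batches)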
persistable_var_names: self._quantized_weight_var_name.add(var_name) - self.weight_op_pairs[var_name] = op_type + self._weight_op_pairs[var_name] = op_type else: self._quantized_act_var_name.add(var_name) @@ -494,20 +507,18 @@ class PostTrainingQuantization(object): if var.name in self._quantized_act_var_name: var.persistable = False - def _sample_threshold(self): + def _sampling(self): ''' - Sample the input threshold(min, max, or abs_max) in every iterations. + Sample the min/max, abs_max or histogram in every iterations. ''' - assert self._algo in ["abs_max", "min_max"], \ - "The algo should be abs_max or min_max for _sample_threshold." if self._algo == "abs_max": - self._sample_threshold_abs_max() + self._sample_abs_max() elif self._algo == "min_max": - self._sample_threshold_min_max() + self._sample_min_max() + elif self._algo == "KL": + self._sample_histogram() - def _sample_threshold_abs_max(self): - assert self._algo == "abs_max", \ - "The algo should be abs_max for _sample_threshold_abs_max." + def _sample_abs_max(self): # Only calculate abs_max value for weight for once if self._quantized_var_abs_max == {}: for var_name in self._quantized_weight_var_name: @@ -516,7 +527,7 @@ class PostTrainingQuantization(object): abs_max_value = float(np.max(np.abs(var_tensor))) elif self._weight_quantize_type == "channel_wise_abs_max": abs_max_value = [] - if self.weight_op_pairs[ + if self._weight_op_pairs[ var_name] in _channelwise_quant_axis1_ops: for i in range(var_tensor.shape[1]): abs_max_value.append( @@ -534,9 +545,7 @@ class PostTrainingQuantization(object): (abs_max_value > self._quantized_var_abs_max[var_name]): self._quantized_var_abs_max[var_name] = abs_max_value - def _sample_threshold_min_max(self): - assert self._algo == "min_max", \ - "The algo should be min_max for _sample_threshold_min_max." + def _sample_min_max(self): if self._quantized_var_min == {} and self._quantized_var_max == {}: for var_name in self._quantized_weight_var_name: var_tensor = _load_variable_data(self._scope, var_name) @@ -546,7 +555,7 @@ class PostTrainingQuantization(object): elif self._weight_quantize_type == "channel_wise_abs_max": min_value = [] max_value = [] - if self.weight_op_pairs[ + if self._weight_op_pairs[ var_name] in _channelwise_quant_axis1_ops: for i in range(var_tensor.shape[1]): min_value.append(float(np.min(var_tensor[:, i]))) @@ -569,6 +578,14 @@ class PostTrainingQuantization(object): (max_value > self._quantized_var_max[var_name]): self._quantized_var_max[var_name] = max_value + def _sample_histogram(self): + for var_name in self._quantized_act_var_name: + var_tensor = _load_variable_data(self._scope, var_name) + var_tensor_abs = np.abs(var_tensor) + bins = self._sampling_act_histogram[var_name][1] + hist, _ = np.histogram(var_tensor_abs, bins=bins) + self._sampling_act_histogram[var_name][0] += hist + def _save_input_threhold(self): ''' Save input threshold to the quantized op. @@ -585,27 +602,36 @@ class PostTrainingQuantization(object): op._set_attr(var_name + ".max", self._quantized_var_max[var_name]) - def _sample_data(self, iter): + def _collect_activation_abs_min_max(self): ''' - Sample the tensor data of quantized variables, - applied in every iteration. + Collect the abs_min and abs_max for all activation. When algo = KL, + get the min and max value, and then calculate the threshold. ''' - assert self._algo == "KL", "The algo should be KL to sample data." 
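The channel-wise branches above reduce over every axis except the quantization axis: axis 1 for ops listed in _channelwise_quant_axis1_ops, axis 0 for the rest. In numpy terms:

import numpy as np

w = np.random.randn(8, 4, 3, 3)  # e.g. a conv weight [out_c, in_c, kh, kw]

per_out_channel = [float(np.max(np.abs(w[i]))) for i in range(w.shape[0])]
per_in_channel = [float(np.max(np.abs(w[:, i]))) for i in range(w.shape[1])]

assert len(per_out_channel) == 8 and len(per_in_channel) == 4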
- if self._is_use_cache_file: - for var_name in self._quantized_act_var_name: - var_tensor = _load_variable_data(self._scope, var_name) - var_tensor = var_tensor.ravel() - save_path = os.path.join( - self._cache_dir, - var_name.replace("/", ".") + "_" + str(iter) + ".npy") - np.save(save_path, var_tensor) - else: - for var_name in self._quantized_act_var_name: - if var_name not in self._sampling_data: - self._sampling_data[var_name] = [] - var_tensor = _load_variable_data(self._scope, var_name) - var_tensor = var_tensor.ravel() - self._sampling_data[var_name].append(var_tensor) + for var_name in self._quantized_act_var_name: + var_tensor = _load_variable_data(self._scope, var_name) + var_tensor = np.abs(var_tensor) + min_value = float(np.min(var_tensor)) + max_value = float(np.max(var_tensor)) + if var_name not in self._sampling_act_abs_min_max: + self._sampling_act_abs_min_max[ + var_name] = [min_value, max_value] + else: + if min_value < self._sampling_act_abs_min_max[var_name][0]: + self._sampling_act_abs_min_max[var_name][0] = min_value + if max_value > self._sampling_act_abs_min_max[var_name][1]: + self._sampling_act_abs_min_max[var_name][1] = max_value + + def _init_sampling_act_histogram(self): + ''' + Based on the min/max value, init the sampling_act_histogram. + ''' + for var_name in self._quantized_act_var_name: + if var_name not in self._sampling_act_histogram: + min_val = self._sampling_act_abs_min_max[var_name][0] + max_val = self._sampling_act_abs_min_max[var_name][1] + hist, hist_edeges = np.histogram( + [], bins=self._histogram_bins, range=(min_val, max_val)) + self._sampling_act_histogram[var_name] = [hist, hist_edeges] def _calculate_kl_threshold(self): ''' @@ -621,7 +647,7 @@ class PostTrainingQuantization(object): weight_threshold = float(np.max(np.abs(weight_data))) elif self._weight_quantize_type == "channel_wise_abs_max": weight_threshold = [] - if self.weight_op_pairs[ + if self._weight_op_pairs[ var_name] in _channelwise_quant_axis1_ops: for i in range(weight_data.shape[1]): weight_threshold.append( @@ -632,25 +658,10 @@ class PostTrainingQuantization(object): float(np.max(np.abs(weight_data[i])))) self._quantized_var_kl_threshold[var_name] = weight_threshold - # KL threshold for activations - if self._is_use_cache_file: - for var_name in self._quantized_act_var_name: - sampling_data = [] - filenames = [f for f in os.listdir(self._cache_dir) \ - if re.match(var_name.replace("/", ".") + '_[0-9]+.npy', f)] - for filename in filenames: - file_path = os.path.join(self._cache_dir, filename) - sampling_data.append(np.load(file_path)) - os.remove(file_path) - sampling_data = np.concatenate(sampling_data) - self._quantized_var_kl_threshold[var_name] = \ - self._get_kl_scaling_factor(np.abs(sampling_data)) - else: - for var_name in self._quantized_act_var_name: - self._sampling_data[var_name] = np.concatenate( - self._sampling_data[var_name]) - self._quantized_var_kl_threshold[var_name] = \ - self._get_kl_scaling_factor(np.abs(self._sampling_data[var_name])) + for var_name in self._quantized_act_var_name: + hist, hist_edeges = self._sampling_act_histogram[var_name] + self._quantized_var_kl_threshold[var_name] = \ + self._get_kl_scaling_factor(hist, hist_edeges) def _update_program(self): ''' @@ -765,22 +776,15 @@ class PostTrainingQuantization(object): for var_name in out_var_names: analysis_and_save_info(op, var_name) - def _get_kl_scaling_factor(self, activation_blob, num_quantized_bins=255): + def _get_kl_scaling_factor(self, hist, hist_edeges, num_quantized_bins=255): 
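The running abs-range merge performed by _collect_activation_abs_min_max above, reduced to a single activation:

import numpy as np

act_range = None
for batch in [np.array([-1.0, 2.0]), np.array([0.5, -3.0])]:
    a = np.abs(batch)
    lo, hi = float(a.min()), float(a.max())
    if act_range is None:
        act_range = [lo, hi]
    else:
        act_range[0] = min(act_range[0], lo)
        act_range[1] = max(act_range[1], hi)

assert act_range == [0.5, 3.0]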
''' Using the KL-divergenc method to get the more precise scaling factor. ''' - max_val = np.max(activation_blob) - min_val = np.min(activation_blob) - if min_val >= 0: - hist, hist_edeges = np.histogram( - activation_blob, bins=2048, range=(min_val, max_val)) - ending_iter = 2047 - starting_iter = int(ending_iter * 0.7) - else: - _logger.error("Please first apply abs to activation_blob.") + ending_iter = self._histogram_bins - 1 + starting_iter = int(ending_iter * 0.7) bin_width = hist_edeges[1] - hist_edeges[0] - P_sum = len(np.array(activation_blob).ravel()) + P_sum = np.sum(np.array(hist).ravel()) min_kl_divergence = 0 min_kl_index = 0 kl_inited = False diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 75e1ea43d15e432d2f6cbec271acd67624de1e01..dadc756c43ecc782a72c1c7d6626e00bc182f2c6 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -299,11 +299,14 @@ class Quant2Int8MkldnnPass(object): # Convert int8 range weights to fp32 range weights scales = self._weight_scales[output_var_name] weight = self._load_param(self._scope, weight_var_name) - assert scales.size == 1 or scales.size == len( - weight - ), "The size of weight scales vector ({}) does not match the number of output channels ({}) in the weights tensor {}.".format( - scales.size, len(weight), weight_var_name) - w_fp32 = np.divide(np.multiply(weight, self._s8_max).T, scales.T).T + if scales.size == 1 or scales.size == weight.shape[0]: + w_fp32 = np.divide(np.multiply(weight, self._s8_max).T, scales.T).T + elif len(weight.shape) > 1 and scales.size == weight.shape[1]: + w_fp32 = np.divide(np.multiply(weight, self._s8_max), scales) + else: + raise ValueError( + "The size of weight scales vector ({}) does not match the dimensions ({}) of the weights tensor {}." 
+ .format(scales.size, weight.shape, weight_var_name)) w_fp32 = w_fp32.reshape(weight.shape).astype(np.float32) self._restore_var(weight_var_name, w_fp32) diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py index fcbb1b66ad1fd73a152b9128fa75a152baecd223..7b51973131496172d61b7ad968417eb41fa11c08 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py @@ -43,7 +43,7 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): self.conv_output = np.ndarray(self.conv_output_size).astype(self.dtype) self.conv_output2 = np.ndarray(self.conv_output2_size).astype( self.dtype) - self.quantized_ops = 'conv2d' + self.quantized_ops = 'conv2d,mul' self.variables = { "input": self.input, "filter": self.filter, @@ -51,6 +51,22 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): "conv_output": self.conv_output, "conv_output2": self.conv_output2, } + self.mul_input_size = [1, 3] + self.mul_weights_size = [3, 5] + self.mul_output_size = [1, 5] + self.mul_input = np.random.random(self.mul_input_size).astype( + self.dtype) + self.mul_weights = np.ones(self.mul_weights_size, self.dtype) + self.mul_weights_bad = np.ones([1, 1], self.dtype) + self.mul_output = np.ndarray(self.mul_output_size).astype(self.dtype) + self.mul_output_scale = np.linspace(1, 5, num=5).astype(self.dtype) + + self.variables_mul = { + "mul_input": self.mul_input, + "mul_weights": self.mul_weights, + "mul_output": self.mul_output, + "mul_weights_bad": self.mul_weights_bad + } def prepare_program(self, program): block = program.global_block() @@ -92,6 +108,23 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): 'fuse_brelu': True }) + def prepare_program_mul(self, program): + block = program.global_block() + for name in self.variables_mul: + block.create_var( + name=name, + dtype="float32", + shape=self.variables_mul[name].shape) + + mul_op1 = block.append_op( + type="mul", + inputs={ + "X": block.var('mul_input'), + "Y": block.var('mul_weights') + }, + outputs={"Out": block.var('mul_output')}, + attrs={'use_mkldnn': self.use_mkldnn}) + def remove_fuse_activation_attribute(self, graph): for op in graph.all_op_nodes(): op.op().remove_attr("fuse_activation") @@ -103,11 +136,13 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): def check_graph_after_pass(self, graph): for op in graph.all_op_nodes(): - self.assertTrue(op.op().has_attr("fuse_activation")) - if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): - self.assertTrue(op.op().attr("fuse_activation") == "relu") - if op.op().has_attr("fuse_brelu") and op.op().attr("fuse_brelu"): - self.assertTrue(op.op().attr("fuse_activation") == "relu6") + if op.op().type() == "conv2d": + self.assertTrue(op.op().has_attr("fuse_activation")) + if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): + self.assertTrue(op.op().attr("fuse_activation") == "relu") + if op.op().has_attr("fuse_brelu") and op.op().attr( + "fuse_brelu"): + self.assertTrue(op.op().attr("fuse_activation") == "relu6") def test_quant_update_activation(self): program = fluid.Program() @@ -125,6 +160,39 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): graph = quant2_int8_mkldnn_pass._update_activations(graph) self.check_graph_after_pass(graph) + def test_dequantize_op_weights(self): + program = fluid.Program() + with fluid.program_guard(program): + self.prepare_program_mul(program) + graph = 
IrGraph(core.Graph(program.desc), for_test=True) + + for op in graph.all_op_nodes(): + if op.op().type() == "mul": + op_node = op + break + + qpass = Quant2Int8MkldnnPass( + self.quantized_ops, + _scope=self.scope, + _place=self.place, + _core=core, + _debug=False) + qpass._weight_scales["mul_output"] = self.mul_output_scale + param = self.scope.var("mul_weights").get_tensor() + param.set(self.variables_mul["mul_weights"], self.place) + qpass._dequantize_op_weights(graph, op_node, "Y", "Out") + + assert np.allclose( + self.scope.find_var("mul_weights").get_tensor(), + [[127, 63.5, 42.3333, 31.75, 25.4], + [127, 63.5, 42.3333, 31.75, 25.4], + [127, 63.5, 42.3333, 31.75, 25.4]]) + + param = self.scope.var("mul_weights").get_tensor() + param.set(self.variables_mul["mul_weights_bad"], self.place) + with self.assertRaises(ValueError): + qpass._dequantize_op_weights(graph, op_node, "Y", "Out") + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/contrib/tests/test_correlation.py b/python/paddle/fluid/contrib/tests/test_correlation.py new file mode 100644 index 0000000000000000000000000000000000000000..7fcef4dbcd1efd3655b6339ed5ec880d8cd33fc0 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_correlation.py @@ -0,0 +1,163 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
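The expected matrix in test_dequantize_op_weights above follows directly from the fp32-restore formula: ones weights times s8_max = 127, divided per output column by scales = linspace(1, 5, 5). A worked numpy check:

import numpy as np

weight = np.ones((3, 5), dtype=np.float32)
s8_max = 127.0
scales = np.linspace(1, 5, num=5).astype(np.float32)

# scales.size == weight.shape[1], so the per-column branch applies
w_fp32 = np.divide(np.multiply(weight, s8_max), scales)
assert np.allclose(w_fp32[0], [127, 63.5, 42.3333, 31.75, 25.4], atol=1e-3)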
+ +import unittest +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.dygraph.base import to_variable + + +def corr(x_1, + x_2, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1, + corr_multiply=1): + K = kernel_size + + rinput1 = np.pad(x_1, ((0, 0), (0, 0), (pad_size, pad_size), + (pad_size, pad_size)), + mode='constant') + rinput2 = np.pad(x_2, ((0, 0), (0, 0), (pad_size, pad_size), + (pad_size, pad_size)), + mode='constant') + rinput1 = np.transpose(rinput1, (0, 2, 3, 1)) + rinput2 = np.transpose(rinput2, (0, 2, 3, 1)) + B = int(rinput1.shape[0]) + H = int(x_1.shape[2]) + W = int(x_2.shape[3]) + d = max_displacement + D = 2 * d + 1 + output = np.zeros((B, D * D, H, W), dtype=np.float32) + + for b in range(B): + for i in range(H): + for j in range(W): + for k in range(-d, d + 1): + for l in range(-d, d + 1): + x1_index = i + pad_size + y1_index = j + pad_size + x2_index = x1_index + k + y2_index = y1_index + l + output[b, l + d + D * (k + d), i, j] = np.mean( + rinput1[b, x1_index:x1_index + K, y1_index:y1_index + + K] * rinput2[b, x2_index:x2_index + K, + y2_index:y2_index + K]) + + return output + + +class TestCorrelationOp(unittest.TestCase): + def test_check_output(self): + if not fluid.core.is_compiled_with_cuda(): + return + np.random.seed(13) + np.set_printoptions(threshold=np.inf) + x_shape = (2, 10, 3, 3) + x_type = 'float32' + x1 = fluid.layers.data( + name='x1', + shape=x_shape, + dtype=x_type, + append_batch_size=False, + stop_gradient=False) + x2 = fluid.layers.data( + name='x2', + shape=x_shape, + dtype=x_type, + append_batch_size=False, + stop_gradient=False) + + x1_np = np.random.randn(2, 3, 4, 5).astype(x_type) + x2_np = np.random.randn(2, 3, 4, 5).astype(x_type) + out_np = corr( + x1_np, + x2_np, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + + out = fluid.contrib.correlation( + x1, + x2, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + + loss = fluid.layers.reduce_mean(out) + optimizer = fluid.optimizer.Momentum(0.0001, 0.9) + optimizer.minimize(loss) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + res = exe.run(feed={'x1': x1_np, + 'x2': x2_np}, + fetch_list=[out.name, loss.name]) + + self.assertTrue(np.allclose(res[0], out_np)) + + +class Net(fluid.dygraph.Layer): + def __init__(self, name_scope): + super(Net, self).__init__(name_scope) + + def forward(self, x1, x2): + y = fluid.contrib.correlation( + x1, + x2, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + return y + + +class TestCorrelationOpDyGraph(unittest.TestCase): + def test_check_output(self): + if not fluid.core.is_compiled_with_cuda(): + return + np.random.seed(13) + np.set_printoptions(threshold=np.inf) + x_shape = (2, 10, 3, 3) + x_type = 'float32' + place = fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + x1_np = np.random.randn(2, 3, 4, 5).astype(x_type) + x2_np = np.random.randn(2, 3, 4, 5).astype(x_type) + out_np = corr( + x1_np, + x2_np, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + + x1 = to_variable(x1_np) + x2 = to_variable(x2_np) + corr_pd = Net('corr_pd') + y = corr_pd(x1, x2) + out = y.numpy() + self.assertTrue(np.allclose(out, out_np)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 6a996493e4df1e1facc6ccd205a8ae5105f92c5b..1ef0d494e0725084b0ddfddcafe93d49da0525d7 100644 
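One detail of the reference corr() above worth keeping in mind: with max_displacement d, the output carries one channel per (k, l) offset pair, i.e. (2*d + 1)**2 channels:

d = 4               # max_displacement used throughout these tests
D = 2 * d + 1
assert D * D == 81  # correlation channels in the output tensor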
--- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -347,6 +347,92 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): return self.__next__() +# NOTE(chenweihang): _worker_loop must be top level method to be pickled +def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, + collate_fn, init_fn, worker_id, num_workers, + use_shared_memory): + try: + # NOTE: [ mmap files clear ] When the child process exits unexpectedly, + # some shared memory objects may have been applied for but have not yet + # been put into the inter-process Queue. This part of the object needs + # to be cleaned up when the process ends. + CleanupFuncRegistrar.register(_cleanup_mmap) + + # set signal handler + core._set_process_signal_handler() + + global _worker_info + _worker_info = WorkerInfo( + id=worker_id, num_workers=num_workers, dataset=dataset) + + init_exception = None + try: + if init_fn is not None: + init_fn(worker_id) + fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, + collate_fn, True) + except: + init_exception = Exception("init_fn failed in worker {}: " \ + "{}".format(worker_id, sys.exc_info())) + + iterator_drained = False + parent_watch_dog = ParentWatchDog() + + while parent_watch_dog.is_alive(): + try: + data = indices_queue.get(MP_INDICES_CHECK_INTERVAL) + except queue.Empty: + continue + + # None as poison piil, so worker event should be set + if data is None: + assert done_event.is_set() or iterator_drained, \ + "get None when worker done_event set" + break + # If worker done event is set but get still get data in + # indices_queue, remaining data should be get and skipped. + if done_event.is_set() or iterator_drained: + continue + + idx, indices = data + try: + if init_exception is not None: + batch = init_exception + init_exception = None + else: + batch = fetcher.fetch(indices) + except Exception as e: + if isinstance( + e, StopIteration) and dataset_kind == _DatasetKind.ITER: + out_queue.put(_IterableDatasetStopIteration(worker_id)) + iterator_drained = True + else: + out_queue.put((idx, e)) + else: + if use_shared_memory: + # FIXME(dkp): _convert_to_tensor_list only support np.array + # list now, should support paddle.Tensor list + if isinstance(batch[0][0], paddle.Tensor): + np_batch = [] + for sample in batch: + np_batch.append([s.numpy() for s in sample]) + batch = np_batch + + tensor_list = core._convert_to_tensor_list(batch) + out_queue.put((idx, tensor_list)) + core._remove_tensor_list_mmap_fds(tensor_list) + else: + out_queue.put((idx, batch)) + except KeyboardInterrupt: + # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process + pass + except: + six.reraise(*sys.exc_info()) + finally: + if use_shared_memory: + _cleanup_mmap() + + class _DataLoaderIterMultiProcess(_DataLoaderIterBase): def __init__(self, loader): super(_DataLoaderIterMultiProcess, self).__init__(loader) @@ -404,11 +490,11 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): indices_queue = multiprocessing.Queue() self._indices_queues.append(indices_queue) worker = multiprocessing.Process( - target=self._worker_loop, + target=_worker_loop, args=(self._dataset, self._dataset_kind, indices_queue, self._data_queue, self._workers_done_event, self._collate_fn, self._worker_init_fn, i, - self._num_workers)) + self._num_workers, self._use_shared_memory)) worker.daemon = True worker.start() self._workers.append(worker) @@ -483,90 +569,6 @@ class 
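Moving _worker_loop to module level matters because multiprocessing pickles the worker target: a top-level function is pickled by name, while a bound method drags the whole loader instance with it and fails as soon as that instance holds unpicklable state. A minimal Python 3 illustration, with a lock standing in for the loader's queues and events:

import pickle
import threading

class Loader(object):
    def __init__(self):
        self.lock = threading.Lock()  # unpicklable, like queues/events

    def _worker_loop(self):
        pass

def worker_loop(use_shared_memory):  # picklable: resolved by module path
    pass

pickle.dumps(worker_loop)  # fine
try:
    pickle.dumps(Loader()._worker_loop)  # pickles Loader(), hits the lock
except Exception as e:
    print("bound method not picklable here:", e)

This is also why use_shared_memory is now threaded through as an explicit argument instead of being read from self.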
_DataLoaderIterMultiProcess(_DataLoaderIterBase): self._blocking_queue.kill() logging.error("DataLoader reader thread raised an exception!") - def _worker_loop(self, dataset, dataset_kind, indices_queue, out_queue, - done_event, collate_fn, init_fn, worker_id, num_workers): - try: - # NOTE: [ mmap files clear ] When the child process exits unexpectedly, - # some shared memory objects may have been applied for but have not yet - # been put into the inter-process Queue. This part of the object needs - # to be cleaned up when the process ends. - CleanupFuncRegistrar.register(_cleanup_mmap) - - # set signal handler - core._set_process_signal_handler() - - global _worker_info - _worker_info = WorkerInfo( - id=worker_id, num_workers=num_workers, dataset=dataset) - - init_exception = None - try: - if init_fn is not None: - init_fn(worker_id) - fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, - collate_fn, True) - except: - init_exception = Exception("init_fn failed in worker {}: " \ - "{}".format(worker_id, sys.exc_info())) - - iterator_drained = False - parent_watch_dog = ParentWatchDog() - - while parent_watch_dog.is_alive(): - try: - data = indices_queue.get(MP_INDICES_CHECK_INTERVAL) - except queue.Empty: - continue - - # None as poison piil, so worker event should be set - if data is None: - assert done_event.is_set() or iterator_drained, \ - "get None when worker done_event set" - break - # If worker done event is set but get still get data in - # indices_queue, remaining data should be get and skipped. - if done_event.is_set() or iterator_drained: - continue - - idx, indices = data - try: - if init_exception is not None: - batch = init_exception - init_exception = None - else: - batch = fetcher.fetch(indices) - except Exception as e: - if isinstance( - e, - StopIteration) and dataset_kind == _DatasetKind.ITER: - out_queue.put(_IterableDatasetStopIteration(worker_id)) - iterator_drained = True - else: - out_queue.put((idx, e)) - else: - if self._use_shared_memory: - # FIXME(dkp): _convert_to_tensor_list only support np.array - # list now, should support paddle.Tensor list - if isinstance(batch[0][0], paddle.Tensor): - np_batch = [] - for sample in batch: - np_batch.append([s.numpy() for s in sample]) - batch = np_batch - - tensor_list = core._convert_to_tensor_list(batch) - out_queue.put((idx, tensor_list)) - core._remove_tensor_list_mmap_fds(tensor_list) - else: - out_queue.put((idx, batch)) - except KeyboardInterrupt: - # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process - pass - except: - six.reraise(*sys.exc_info()) - finally: - if self._use_shared_memory: - _cleanup_mmap() - def _thread_loop(self): while not self._thread_done_event.is_set(): batch = self._get_data() diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 8f3ca9ec007ef5c1ab8769dde741a5d2b3697600..ff57f30dcd2ec73d55ff06e751767deea0a2eead 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -210,13 +210,12 @@ class AmpScaler(object): def _unscale(self, optimizer): if not self._enable: return - inv_scale = 1.0 / self._scale param_grads = [ param._grad_ivar() for param in optimizer._parameter_list if param._grad_ivar() is not None ] - core.ops.amp_check_finite_and_scale(param_grads, inv_scale, param_grads, - self._found_inf) + core.ops.check_finite_and_unscale(param_grads, self._scale, param_grads, + self._found_inf) def _update(self): """ diff --git 
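Note the semantic flip in _unscale above: amp_check_finite_and_scale multiplied by a precomputed inv_scale, while check_finite_and_unscale divides by the scale itself, so the Python side drops the 1.0 / self._scale computation. For a power-of-two scale the two are exactly equal:

scale = 1024.0
grad = 512.0
assert grad * (1.0 / scale) == grad / scale == 0.5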
a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index c548bdfeba19510b26c0f80d356fa6a6b7bbaed7..2f95c2b9007a53483fda86dda8d77e9baff0d8d2 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -25,6 +25,7 @@ from .tracer import Tracer import logging import objgraph from ..data_feeder import convert_dtype +import warnings __all__ = [ 'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', @@ -609,10 +610,10 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): uint8, uint16, complex64, complex128}. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + refer to :ref:`api_guide_Name` . zero_copy(bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace and will be set to - True when it is None. Default: None. + True when it is None. Default: None. (Note: zero_copy is discarded temporally for some reason.) dtype(str, optional): The desired data type of returned ``Variable`` . Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8' . Default: None. @@ -665,8 +666,17 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): else: if isinstance(framework._current_expected_place(), framework.core.CPUPlace): - if zero_copy is None: - zero_copy = True + #TODO(zhiqiu): we found two problems when enable zero_copy on CPUPlace. + # (1): eigen requires 16-bytes alignments, but the data of numpy array may not statisfy. + # Details: https://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html + # (2): when used in flask framework, it may result in hang. + # Details: https://github.com/PaddlePaddle/Paddle/issues/26635 + # So, we temporally diable the zero_copy strategy. + if zero_copy == True: + warnings.warn( + "Currently, zero_copy is not supported, and it will be discarded." + ) + zero_copy = False else: assert not zero_copy, "zero_copy mode can only be used with CPUPlace" diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index f85b184f68111bbc0930b36e2ba6e05c2dbd006a..9876fc620b870f47b10e9f99e4de34f5cb81fde1 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -24,8 +24,8 @@ from . import learning_rate_scheduler import warnings from .. 
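The zero_copy change above makes the flag a no-op everywhere: on CPUPlace a True value only triggers a warning before being dropped, and other places keep the existing assertion. The gating logic in isolation:

import warnings

def resolve_zero_copy(zero_copy, on_cpu_place):
    if on_cpu_place:
        if zero_copy == True:
            warnings.warn("Currently, zero_copy is not supported, "
                          "and it will be discarded.")
        return False
    assert not zero_copy, "zero_copy mode can only be used with CPUPlace"
    return False

assert resolve_zero_copy(True, on_cpu_place=True) is False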
import core from .base import guard -from paddle.fluid.dygraph.jit import SaveLoadConfig -from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers +from paddle.fluid.dygraph.jit import SaveLoadConfig, deprecate_save_load_configs +from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME __all__ = [ 'save_dygraph', @@ -42,9 +42,9 @@ def deprecate_keep_name_table(func): warnings.warn( "The argument `keep_name_table` has deprecated, please use `SaveLoadConfig.keep_name_table`.", DeprecationWarning) - configs = SaveLoadConfig() - configs.keep_name_table = keep_name_table - return configs + config = SaveLoadConfig() + config.keep_name_table = keep_name_table + return config # deal with arg `keep_name_table` if len(args) > 1 and isinstance(args[1], bool): @@ -52,7 +52,7 @@ def deprecate_keep_name_table(func): args[1] = __warn_and_build_configs__(args[1]) # deal with kwargs elif 'keep_name_table' in kwargs: - kwargs['configs'] = __warn_and_build_configs__(kwargs[ + kwargs['config'] = __warn_and_build_configs__(kwargs[ 'keep_name_table']) kwargs.pop('keep_name_table') else: @@ -135,8 +135,9 @@ def save_dygraph(state_dict, model_path): # TODO(qingqing01): remove dygraph_only to support loading static model. # maybe need to unify the loading interface after 2.0 API is ready. # @dygraph_only +@deprecate_save_load_configs @deprecate_keep_name_table -def load_dygraph(model_path, configs=None): +def load_dygraph(model_path, config=None): ''' :api_attr: imperative @@ -151,7 +152,7 @@ def load_dygraph(model_path, configs=None): Args: model_path(str) : The file prefix store the state_dict. (The path should Not contain suffix '.pdparams') - configs (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` + config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies additional configuration options, these options are for compatibility with ``jit.save/io.save_inference_model`` formats. Default None. @@ -195,6 +196,7 @@ def load_dygraph(model_path, configs=None): opti_file_path = model_prefix + ".pdopt" # deal with argument `configs` + configs = config if configs is None: configs = SaveLoadConfig() @@ -231,6 +233,19 @@ def load_dygraph(model_path, configs=None): para_dict = dict() for var_name in persistable_var_dict: para_dict[var_name] = persistable_var_dict[var_name].numpy() + + # if __variables.info__ exists, we can recover structured_name + var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME) + if os.path.exists(var_info_path): + with open(var_info_path, 'rb') as f: + extra_var_info = pickle.load(f) + structured_para_dict = dict() + for var_name in para_dict: + structured_name = extra_var_info[var_name].get( + 'structured_name', None) + assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." % var_name + structured_para_dict[structured_name] = para_dict[var_name] + para_dict = structured_para_dict else: # Load state dict by `save_dygraph` save format para_dict = {} diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py index 5aba7ca0fdc0cfda5d79f5a66d78785df49c0baf..be21ab6d5394ed5f89c23988a9405b57e05b56fb 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py @@ -13,6 +13,7 @@ # limitations under the License. 
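The structured-name recovery added to load_dygraph above remaps variable-name keys to the structured names recorded next to the saved model. The same remapping on illustrative dicts in place of a pickled __variables.info__ file:

para_dict = {"linear_0.w_0": [[1.0]], "linear_0.b_0": [0.0]}
extra_var_info = {"linear_0.w_0": {"structured_name": "net.weight"},
                  "linear_0.b_0": {"structured_name": "net.bias"}}

structured_para_dict = {}
for var_name in para_dict:
    structured_name = extra_var_info[var_name].get("structured_name", None)
    assert structured_name is not None, \
        "Cannot find saved variable (%s)'s structured name." % var_name
    structured_para_dict[structured_name] = para_dict[var_name]

assert sorted(structured_para_dict) == ["net.bias", "net.weight"]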
import os +import six import sys import traceback @@ -20,6 +21,14 @@ from paddle.fluid.dygraph.dygraph_to_static.origin_info import Location, OriginI ERROR_DATA = "Error data about original source code information and traceback." +# A flag controlling whether to simplify the error message reported by the dygraph2static error module +SIMPLIFY_ERROR_ENV_NAME = "TRANSLATOR_SIMPLIFY_NEW_ERROR" +DEFAULT_SIMPLIFY_NEW_ERROR = 1 + +# A flag controlling whether to disable the dygraph2static error module and raise the original error +DISABLE_ERROR_ENV_NAME = "TRANSLATOR_DISABLE_NEW_ERROR" +DEFAULT_DISABLE_NEW_ERROR = 0 + def attach_error_data(error, in_runtime=False): """ @@ -103,7 +112,10 @@ class ErrorData(object): # Simplify error value to improve readability if error is raised in runtime if self.in_runtime: - self._simplify_error_value() + if int( + os.getenv(SIMPLIFY_ERROR_ENV_NAME, + DEFAULT_SIMPLIFY_NEW_ERROR)): + self._simplify_error_value() message_lines.append(str(self.error_value)) return '\n'.join(message_lines) @@ -150,3 +162,22 @@ class ErrorData(object): error_value_str = '\n'.join(error_value_lines) self.error_value = self.error_type(error_value_str) + + def raise_new_exception(self): + + # Raise the original error if the dygraph2static error module is disabled. + if int(os.getenv(DISABLE_ERROR_ENV_NAME, DEFAULT_DISABLE_NEW_ERROR)): + raise + + new_exception = self.create_exception() + if six.PY3: + # NOTE(liym27): + # 1. Why `raise new_exception from None`? + # In Python 3, by default, a new exception is raised with trace information of the caught exception. + # This only raises new_exception and hides unwanted implementation details from tracebacks of the + # caught exception. + # 2. Use exec to bypass syntax error checking in Python 2. + + six.exec_("raise new_exception from None") + else: + raise new_exception diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py index 90e38bd98863ff62174bd569a483b11984480b5a..37ce8b0a152ff8e258e8aee2a54ed7215f77c146 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py @@ -135,6 +135,11 @@ class FunctionSpec(object): input_with_spec = pack_sequence_as(args, input_with_spec) + # If no name is specified in input_spec, add a default name + # according to the argument name of the decorated function. + input_with_spec = replace_spec_empty_name(self._arg_names, + input_with_spec) + return input_with_spec @switch_to_static_graph @@ -309,3 +314,61 @@ def convert_to_input_spec(inputs, input_spec): raise TypeError( "The type(input_spec) should be a `InputSpec` or dict/list/tuple of it, but received {}.".format(type_name(input_spec))) + + +def replace_spec_empty_name(args_name, input_with_spec): + """ + Adds a default name, derived from the argument name of the decorated + function, if InputSpec.name is not specified. + + The naming rules are as follows: + 1. If InputSpec.name is not None, do nothing. + 2. If an argument `x` corresponds to an InputSpec, use the argument name `x`. + 3. If an argument `inputs` corresponds to a list(InputSpec), use names like `inputs_0`, `inputs_1`. + 4. If an argument `input_dic` corresponds to a dict(InputSpec), use the dict key as the name.
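Stepping back to the error-module hunk above: both switches are plain environment variables read with int(os.getenv(...)), so they can be flipped from a shell or from Python before the error path runs. A short sketch of both toggles; the flag names come from the patch, while the surrounding script is illustrative only:

    import os

    # keep runtime error messages unsimplified (the default of 1 means simplify)
    os.environ["TRANSLATOR_SIMPLIFY_NEW_ERROR"] = "0"

    # re-raise the original exception instead of the rewritten one
    # (the default of 0 keeps the new error module enabled)
    os.environ["TRANSLATOR_DISABLE_NEW_ERROR"] = "1"

    # this mirrors how error.py reads each flag
    disabled = int(os.getenv("TRANSLATOR_DISABLE_NEW_ERROR", 0))
    print(disabled)  # 1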
+ + For example: + + # case 1: foo(x, y) + foo = to_static(foo, input_spec=[InputSpec([None, 10]), InputSpec([None])]) + print([in_var.name for in_var in foo.inputs]) # [x, y] + + # case 2: foo(inputs) where inputs is a list + foo = to_static(foo, input_spec=[[InputSpec([None, 10]), InputSpec([None])]]) + print([in_var.name for in_var in foo.inputs]) # [inputs_0, inputs_1] + + # case 3: foo(inputs) where inputs is a dict + foo = to_static(foo, input_spec=[{'x': InputSpec([None, 10]), 'y': InputSpec([None])}]) + print([in_var.name for in_var in foo.inputs]) # [x, y] + """ + input_with_spec = list(input_with_spec) + candidate_arg_names = args_name[:len(input_with_spec)] + + for i, arg_name in enumerate(candidate_arg_names): + input_spec = input_with_spec[i] + input_with_spec[i] = _replace_spec_name(arg_name, input_spec) + + return input_with_spec + + +def _replace_spec_name(name, input_spec): + """ + Replaces InputSpec.name with the given `name` if it was not specified. + """ + if isinstance(input_spec, paddle.static.InputSpec): + if input_spec.name is None: + input_spec.name = name + return input_spec + elif isinstance(input_spec, (list, tuple)): + processed_specs = [] + for i, spec in enumerate(input_spec): + new_name = "{}_{}".format(name, i) + processed_specs.append(_replace_spec_name(new_name, spec)) + return processed_specs + elif isinstance(input_spec, dict): + processed_specs = {} + for key, spec in six.iteritems(input_spec): + processed_specs[key] = _replace_spec_name(key, spec) + return processed_specs + else: + return input_spec diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py index 13f38b0726c27566ff0eda41d6c365e6a7e4aa4b..76e732d4d37f6a2056afba72649077acf16ba30e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py @@ -124,8 +124,13 @@ class OriginInfoAttacher(gast.NodeTransformer): def _abs_lineno(self, node): # NOTE(liym27): - # If the first gast.FunctionDef has decorator, its lineno is 1, which - # equals to the lineno of the first decorator node. + # ast_node.lineno differs between Python < 3.8 and >= 3.8. + # If the first gast.FunctionDef has a decorator, the lineno of the gast.FunctionDef differs: + # 1. < PY3.8 + # its lineno equals the lineno of the first decorator node, which is not right. + # 2. >= PY3.8 + # its lineno is the actual lineno, which is right.
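The version-dependent lineno behavior described in the NOTE above can be observed directly with the standard ast module (gast mirrors the underlying ast node). A quick self-contained check:

    import ast
    import sys

    source = "@decorator\ndef foo():\n    pass\n"
    func_def = ast.parse(source).body[0]

    # On Python < 3.8 this prints 1 (the decorator's line);
    # on Python >= 3.8 it prints 2 (the line of `def` itself).
    print(sys.version_info[:2], func_def.lineno)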
+ return self.lineno_offset + node.lineno def _abs_col_offset(self, node): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 3d27810f1db94c4f6c273399ec93b9335f5bb03a..e5fce3e6ede1511458f8da916165738d9e842d1a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -32,8 +32,7 @@ from paddle.fluid.layers.utils import flatten from paddle.fluid.dygraph.base import param_guard from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst -from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA -from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data +from paddle.fluid.dygraph.dygraph_to_static import error from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map @@ -315,6 +314,7 @@ class StaticLayer(object): # 2. trace ops from dygraph layers and cache the generated program. args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs) + try: concrete_program, partial_program_layer = self.get_concrete_program( *args, **kwargs) @@ -324,27 +324,22 @@ class StaticLayer(object): partial_program_layer.training = self._class_instance.training # 4. return outputs. - return partial_program_layer(args) + try: + return partial_program_layer(args) + except Exception as e: + if not hasattr(e, error.ERROR_DATA): + # runtime error + error.attach_error_data(e, in_runtime=True) + raise except Exception as e: - if not hasattr(e, ERROR_DATA): - # runtime error - attach_error_data(e, in_runtime=True) - error_data = getattr(e, ERROR_DATA, None) + error_data = getattr(e, error.ERROR_DATA, None) if error_data: - new_exception = error_data.create_exception() - if six.PY3: - # NOTE(liym27): - # 1. Why `raise new_exception from None`? - # In Python 3, by default, an new exception is raised with trace information of the caught exception. - # This only raises new_exception and hides unwanted implementation details from tracebacks of the - # caught exception. - # 2. Use exec to bypass syntax error checking in Python 2. - - six.exec_("raise new_exception from None") - else: - raise new_exception + error_data.raise_new_exception() else: - raise + logging_utils.warn( + "Please file an issue at 'https://github.com/PaddlePaddle/Paddle/issues'" + " if you can't handle this {} yourself.".format(type(e))) + raise e def _call_dygraph_function(self, *args, **kwargs): """ @@ -593,7 +588,7 @@ class ConcreteProgram(object): outputs = static_func(*inputs) except BaseException as e: # NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here. - attach_error_data(e) + error.attach_error_data(e) raise if not isinstance(outputs, @@ -813,28 +808,36 @@ class ProgramTranslator(object): "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable = False. 
" "We will just return dygraph output.") return dygraph_func(*args, **kwargs) - - function_spec = FunctionSpec(dygraph_func) - cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs, - getattr(dygraph_func, - '__self__', None)) - _, partial_program_layer = self._program_cache[cache_key] - - if args and isinstance(args[0], layers.Layer): - # Synchronize self.training attribute. - partial_program_layer.training = args[0].training - args = args[1:] try: - return partial_program_layer(args) - + function_spec = FunctionSpec(dygraph_func) + cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs, + getattr(dygraph_func, + '__self__', None)) + _, partial_program_layer = self._program_cache[cache_key] + + if args and isinstance(args[0], layers.Layer): + # Synchronize self.training attribute. + partial_program_layer.training = args[0].training + args = args[1:] + try: + return partial_program_layer(args) + except BaseException as e: + # NOTE: + # 1. If e is raised in compile time, e should have been attached to ERROR_DATA before; + # 2. If e raised in runtime, e should be attached to ERROR_DATA here. + if not hasattr(e, error.ERROR_DATA): + # runtime error + error.attach_error_data(e, in_runtime=True) + raise except BaseException as e: - # NOTE: - # 1. If e is raised in compile time, e should have been attached to ERROR_DATA before; - # 2. If e raised in runtime, e should be attached to ERROR_DATA here. - if not hasattr(e, ERROR_DATA): - # runtime error - attach_error_data(e, in_runtime=True) - raise + error_data = getattr(e, error.ERROR_DATA, None) + if error_data: + error_data.raise_new_exception() + else: + logging_utils.warn( + "Please file an issue at 'https://github.com/PaddlePaddle/Paddle/issues'" + " if you can't handle this {} yourself.".format(type(e))) + raise e def get_func(self, dygraph_func): """ diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 1d2ea142c7d5f2e653e446986a39d1bc155006f0..4391843b0efb5636104973f0524131aa64751ffa 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -19,6 +19,7 @@ import six import pickle import numpy as np +import paddle from paddle import compat as cpt from paddle.fluid import core from paddle.fluid import framework @@ -182,9 +183,9 @@ class _ProgramHolder(object): super(_ProgramHolder, self).__init__() # input, output, persistable var info - self._input_names = [] - self._persistable_names = [] + self._input_descs = [] self._output_descs = [] + self._persistable_names = [] # execution scope self._inner_scope = core.Scope() @@ -207,11 +208,11 @@ class _ProgramHolder(object): return self._train_program_desc @property - def input_names(self): - return self._input_names + def input_descs(self): + return self._input_descs @property - def output_decs(self): + def output_descs(self): return self._output_descs @property @@ -233,7 +234,8 @@ class _ProgramHolder(object): ops_to_remove.append(i) feed_var_name = cpt.to_bytes(op.input('X')[0]) root_block._remove_var(feed_var_name) - self._input_names.append(cpt.to_bytes(op.output('Out')[0])) + self._input_descs.append( + root_block.find_var(cpt.to_bytes(op.output('Out')[0]))) elif op.type() == 'scale' and op.output('Out')[0].startswith( 'save_infer_model/scale_'): ops_to_remove.append(i) @@ -257,7 +259,7 @@ class _ProgramHolder(object): root_block._remove_op(op_idx, op_idx + 1) # 2. Input processing, reverse feed vars - self._input_names.reverse() + self._input_descs.reverse() # 3. 
Output processing, add scale for outputs tmp_program = _build_program_by_desc(program_desc) @@ -479,11 +481,15 @@ def _load_persistable_vars(model_path, var_file_path = os.path.join(model_path, params_filename) else: var_file_path = os.path.join(model_path, VARIABLE_FILENAME) - framework._dygraph_tracer().trace_op( - type='load_combine', - inputs={}, - outputs={'Out': load_var_list}, - attrs={'file_path': var_file_path}) + if not os.path.exists(var_file_path): + if len(extra_var_info) != 0: + raise ValueError("The model to be loaded is incomplete.") + else: + framework._dygraph_tracer().trace_op( + type='load_combine', + inputs={}, + outputs={'Out': load_var_list}, + attrs={'file_path': var_file_path}) return load_var_dict @@ -734,7 +740,7 @@ class TranslatedLayer(layers.Layer): if isinstance(value, np.ndarray): var = core.VarBase( value=value, - name=program_holder.input_names[i], + name=program_holder.input_descs[i].name(), persistable=False, place=framework._current_expected_place(), zero_copy=True) @@ -742,7 +748,7 @@ class TranslatedLayer(layers.Layer): var = value # NOTE: we changed var name here, # but it may be an important name set by user - var.name = program_holder.input_names[i] + var.name = program_holder.input_descs[i].name() input_vars.append(var) persistable_vars = [] @@ -758,7 +764,7 @@ class TranslatedLayer(layers.Layer): % var_name) output_vars = [] - for var_desc in program_holder.output_decs: + for var_desc in program_holder.output_descs: var = core.VarBase(var_desc.dtype(), var_desc.shape(), var_desc.name(), var_desc.type(), False) @@ -909,11 +915,7 @@ class TranslatedLayer(layers.Layer): program = translated_layer.program() """ # 1. get program holder - program_holder = self._program_holder_dict.get(method_name, None) - if program_holder is None: - raise ValueError( - "The method `%s` is not exists in loaded TranslatedLayer." % - method_name) + program_holder = self._get_program_holder(method_name) # 2. get inference program desc program_desc = program_holder.infer_program @@ -921,3 +923,44 @@ class TranslatedLayer(layers.Layer): # 3. construct program program = _build_program_by_desc(program_desc) return program + + def _get_program_holder(self, method_name='forward'): + program_holder = self._program_holder_dict.get(method_name, None) + if program_holder is None: + raise ValueError( + "The method `%s` does not exist in loaded TranslatedLayer." % + method_name) + return program_holder + + def _input_spec(self, method_name='forward'): + # 1. get program holder + program_holder = self._get_program_holder(method_name) + + # 2. build input spec by input desc + input_spec = [] + for var_desc in program_holder.input_descs: + spec = paddle.static.InputSpec( + shape=var_desc.shape(), + dtype=var_desc.dtype(), + name=var_desc.name()) + input_spec.append(spec) + + return input_spec + + def _output_spec(self, method_name='forward'): + # 1. get program holder + program_holder = self._get_program_holder(method_name) + + # 2. build output spec by output desc + output_spec = [] + for var_desc in program_holder.output_descs: + # NOTE(chenweihang): InputSpec describes a tensor, not just input. + # Maybe the name is not good enough. 
Here we use InputSpec to + # construct the description of Output tensor + spec = paddle.static.InputSpec( + shape=var_desc.shape(), + dtype=var_desc.dtype(), + name=var_desc.name()) + output_spec.append(spec) + + return output_spec diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 9f4ec2b55bc6b56fc796d3124edf1ec0deb3f23e..57864efec8a9447cca0be94f0f1b433c18435376 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -17,6 +17,8 @@ from __future__ import print_function import os import pickle import warnings +import functools +from collections import OrderedDict import six import paddle @@ -36,7 +38,7 @@ from paddle.fluid.wrapped_decorator import wrap_decorator __all__ = [ 'TracedLayer', 'declarative', 'dygraph_to_static_func', 'set_code_level', - 'set_verbosity' + 'set_verbosity', 'save', 'load', 'SaveLoadConfig' ] @@ -210,7 +212,16 @@ def declarative(function=None, input_spec=None): # for usage: `declarative(foo, ...)` if function is not None: - return decorated(function) + if isinstance(function, Layer): + if isinstance(function.forward, StaticLayer): + class_name = function.__class__.__name__ + warnings.warn( + "`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one.". + format(class_name)) + function.forward = decorated(function.forward) + return function + else: + return decorated(function) # for usage: `@declarative` return decorated @@ -228,63 +239,60 @@ class SaveLoadConfig(object): .. code-block:: python - import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self, in_size, out_size): super(SimpleNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(in_size, out_size) - @declarative + @paddle.jit.to_static def forward(self, x): y = self._linear(x) z = self._linear(y) return z # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # train model net = SimpleNet(8, 8) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) + x = paddle.randn([4, 8], 'float32') for i in range(10): out = net(x) - loss = fluid.layers.mean(out) + loss = paddle.tensor.mean(out) loss.backward() - adam.minimize(loss) - net.clear_gradients() + adam.step() + adam.clear_grad() # use SaveLoadconfig when saving model model_path = "simplenet.example.model" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.model_filename = "__simplenet__" - fluid.dygraph.jit.save( + config = paddle.SaveLoadConfig() + config.model_filename = "__simplenet__" + paddle.jit.save( layer=net, model_path=model_path, - input_spec=[x], - configs=configs) + config=config) 2. Using ``SaveLoadConfig`` when loading model .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid + import paddle # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # use SaveLoadconfig when loading model model_path = "simplenet.example.model" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.model_filename = "__simplenet__" - infer_net = fluid.dygraph.jit.load(model_path, configs=configs) + config = paddle.SaveLoadConfig() + config.model_filename = "__simplenet__" + infer_net = paddle.jit.load(model_path, config=config) # inference - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + x = paddle.randn([4, 8], 'float32') pred = infer_net(x) """ @@ -324,51 +332,46 @@ class SaveLoadConfig(object): Examples: .. code-block:: python - import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self, in_size, out_size): super(SimpleNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(in_size, out_size) - @declarative + @paddle.jit.to_static def forward(self, x): y = self._linear(x) z = self._linear(y) - loss = fluid.layers.mean(z) + loss = paddle.tensor.mean(z) return z, loss # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # train model net = SimpleNet(8, 8) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) + x = paddle.randn([4, 8], 'float32') for i in range(10): out, loss = net(x) loss.backward() - adam.minimize(loss) - net.clear_gradients() + adam.step() + adam.clear_grad() # use SaveLoadconfig.output_spec model_path = "simplenet.example.model.output_spec" - configs = fluid.dygraph.jit.SaveLoadConfig() - # only keep the predicted output in saved model, discard loss - configs.output_spec = [out] - - fluid.dygraph.jit.save( + config = paddle.SaveLoadConfig() + config.output_spec = [out] + paddle.jit.save( layer=net, model_path=model_path, - input_spec=[x], - configs=configs) + config=config) - infer_net = fluid.dygraph.jit.load(model_path, configs=configs) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) - # only have the predicted output + infer_net = paddle.jit.load(model_path) + x = paddle.randn([4, 8], 'float32') pred = infer_net(x) """ return self._output_spec @@ -395,52 +398,47 @@ class SaveLoadConfig(object): Examples: .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self, in_size, out_size): super(SimpleNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(in_size, out_size) - @declarative + @paddle.jit.to_static def forward(self, x): y = self._linear(x) z = self._linear(y) return z # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # train model net = SimpleNet(8, 8) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) + x = paddle.randn([4, 8], 'float32') for i in range(10): out = net(x) - loss = fluid.layers.mean(out) + loss = paddle.tensor.mean(out) loss.backward() - adam.minimize(loss) - net.clear_gradients() - - model_path = "simplenet.example.model.model_filename" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.model_filename = "__simplenet__" + adam.step() + adam.clear_grad() # saving with configs.model_filename - fluid.dygraph.jit.save( + model_path = "simplenet.example.model.model_filename" + config = paddle.SaveLoadConfig() + config.model_filename = "__simplenet__" + paddle.jit.save( layer=net, model_path=model_path, - input_spec=[x], - configs=configs) - # [result] the saved model directory contains: - # __simplenet__ __variables__ __variables.info__ + config=config) # loading with configs.model_filename - infer_net = fluid.dygraph.jit.load(model_path, configs=configs) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + infer_net = paddle.jit.load(model_path, config=config) + x = paddle.randn([4, 8], 'float32') pred = infer_net(x) """ return self._model_filename @@ -465,52 +463,48 @@ class SaveLoadConfig(object): Examples: .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self, in_size, out_size): super(SimpleNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(in_size, out_size) - @declarative + @paddle.jit.to_static def forward(self, x): y = self._linear(x) z = self._linear(y) return z # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # train model net = SimpleNet(8, 8) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) + x = paddle.randn([4, 8], 'float32') for i in range(10): out = net(x) - loss = fluid.layers.mean(out) + loss = paddle.tensor.mean(out) loss.backward() - adam.minimize(loss) - net.clear_gradients() + adam.step() + adam.clear_grad() model_path = "simplenet.example.model.params_filename" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.params_filename = "__params__" + config = paddle.SaveLoadConfig() + config.params_filename = "__params__" # saving with configs.params_filename - fluid.dygraph.jit.save( + paddle.jit.save( layer=net, model_path=model_path, - input_spec=[x], - configs=configs) - # [result] the saved model directory contains: - # __model__ __params__ __variables.info__ + config=config) # loading with configs.params_filename - infer_net = fluid.dygraph.jit.load(model_path, configs=configs) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + infer_net = paddle.jit.load(model_path, config=config) + x = paddle.randn([4, 8], 'float32') pred = infer_net(x) """ return self._params_filename @@ -544,52 +538,50 @@ class SaveLoadConfig(object): Examples: .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self, in_size, out_size): super(SimpleNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(in_size, out_size) - @declarative + @paddle.jit.to_static def forward(self, x): y = self._linear(x) z = self._linear(y) return z # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # train model net = SimpleNet(8, 8) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) + x = paddle.randn([4, 8], 'float32') for i in range(10): out = net(x) - loss = fluid.layers.mean(out) + loss = paddle.tensor.mean(out) loss.backward() - adam.minimize(loss) - net.clear_gradients() + adam.step() + adam.clear_grad() model_path = "simplenet.example.model.separate_params" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.separate_params = True + config = paddle.jit.SaveLoadConfig() + config.separate_params = True # saving with configs.separate_params - fluid.dygraph.jit.save( + paddle.jit.save( layer=net, model_path=model_path, - input_spec=[x], - configs=configs) + config=config) # [result] the saved model directory contains: # linear_0.b_0 linear_0.w_0 __model__ __variables.info__ # loading with configs.params_filename - infer_net = fluid.dygraph.jit.load(model_path, configs=configs) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + infer_net = paddle.jit.load(model_path, config=config) + x = paddle.randn([4, 8], 'float32') pred = infer_net(x) """ return self._separate_params @@ -651,8 +643,88 @@ class SaveLoadConfig(object): self._keep_name_table = value +def _get_input_var_names(inputs, input_spec): + name_none_error = "The %s's name is None. " \ + "When using jit.save, please set InputSpec's name in " \ + "to_static(input_spec=[]) and jit.save(input_spec=[]) " \ + "and make sure they are consistent." + name_no_exists_error = "The tensor `%s` does not exist. " \ + "Please make sure the name of InputSpec or example Tensor " \ + "in input_spec is the same as the name of InputSpec in " \ + "`to_static` decorated on the Layer.forward method."
+ result_list = [] + input_var_names = [var.name for var in inputs if isinstance(var, Variable)] + if input_spec is None: + # no prune + result_list = input_var_names + elif input_spec is not None and len(input_spec) == len(input_var_names): + # no prune + result_list = input_var_names + # if an input spec name is not in input_var_names, only raise a warning + for spec in input_spec: + if spec.name is None: + warnings.warn(name_none_error % spec) + elif spec.name not in input_var_names: + warnings.warn(name_no_exists_error % spec.name) + else: + # do nothing + pass + else: + # prune + for spec in input_spec: + if spec.name is None: + # name is None, so the input_spec can only be InputSpec + raise ValueError(name_none_error % spec) + elif spec.name not in input_var_names: + # the input_spec can be `InputSpec` or `VarBase` + raise ValueError(name_no_exists_error % spec.name) + else: + result_list.append(spec.name) + + return result_list + + +def _get_output_vars(outputs, output_spec): + name_no_exists_error = "The tensor `%s` does not exist. " \ + "Please make sure the name of example Tensor " \ + "in configs.output_spec is the output tensor of " \ + "Layer.forward method." + result_list = [] + output_vars_dict = OrderedDict() + for var in outputs: + if isinstance(var, Variable): + output_vars_dict[var.name] = var + if output_spec is None: + result_list = output_vars_dict.values() + elif output_spec is not None and len(output_spec) == len(output_vars_dict): + result_list = output_vars_dict.values() + for var in output_spec: + if var.name not in output_vars_dict: + warnings.warn(name_no_exists_error % var.name) + else: + for var in output_spec: + if var.name not in output_vars_dict: + raise ValueError(name_no_exists_error % var.name) + else: + result_list.append(output_vars_dict[var.name]) + return result_list + + +# NOTE(chenweihang): change jit.save/load argument `configs` to `config` +def deprecate_save_load_configs(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if 'configs' in kwargs: + kwargs['config'] = kwargs['configs'] + kwargs.pop('configs') + return func(*args, **kwargs) + + return wrapper + + +@deprecate_save_load_configs @switch_to_static_graph -def save(layer, model_path, input_spec=None, configs=None): +def save(layer, model_path, input_spec=None, config=None): """ Saves input declarative Layer as :ref:`api_imperative_TranslatedLayer` format model, which can be used for inference or fine-tuning after loading. @@ -677,7 +749,7 @@ def save(layer, model_path, input_spec=None, configs=None): It is the example inputs that will be passed to saved TranslatedLayer's forward function. If None, all input variables of the original Layer's forward function would be the inputs of the saved model. Default None. - configs (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object + config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies additional configuration options. Default None. Returns: None Examples: ..
code-block:: python import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - BATCH_SIZE = 32 - BATCH_NUM = 20 + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 - def random_batch_reader(): - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label + IMAGE_SIZE = 784 + CLASS_NUM = 10 - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples - return __reader__ + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label - class LinearNet(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + def __init__(self): super(LinearNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - @declarative + @paddle.jit.to_static def forward(self, x): return self._linear(x) + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + # enable dygraph mode - fluid.enable_dygraph() + place = paddle.CPUPlace() + paddle.disable_static(place) - # create network - net = LinearNet(784, 1) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - # create data loader - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) - # train - for data in train_loader(): - img, label = data - label.stop_gradient = True + # 1. train & save model. 
- cost = net(img) + # create network + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) - avg_loss.backward() - adam.minimize(avg_loss) - net.clear_gradients() + # train + train(layer, loader, loss_fn, adam) - # save model + # save model_path = "linear.example.model" - fluid.dygraph.jit.save( - layer=net, - model_path=model_path, - input_spec=[img]) + paddle.jit.save(layer, model_path) """ - def get_inout_spec(all_vars, target_vars, return_name=False): - result_list = [] - valid_var_dict = {} - valid_vars = [var for var in all_vars if isinstance(var, Variable)] - for var in valid_vars: - valid_var_dict[var.name] = var - if target_vars: - for i, var in enumerate(target_vars): - # check target var whether exists - if var.name not in valid_var_dict: - raise RuntimeError( - "The variable to feed/fetch are not exist.") - result_list.append(valid_var_dict[var.name]) - else: - result_list = valid_vars - if return_name: - result_list = [var.name for var in result_list] - - return result_list - # 1. input check prog_translator = ProgramTranslator() if not prog_translator.enable: @@ -778,28 +841,62 @@ def save(layer, model_path, input_spec=None, configs=None): "The input layer of paddle.jit.save should be 'Layer', but received layer type is %s." % type(layer)) + configs = config if configs is None: configs = SaveLoadConfig() + # avoid change user given input_spec + inner_input_spec = None if input_spec is not None: if not isinstance(input_spec, list): raise TypeError( "The input input_spec should be 'list', but received input_spec's type is %s." % type(input_spec)) + inner_input_spec = [] for var in input_spec: - if not isinstance(var, (core.VarBase, Variable, - paddle.static.InputSpec)): + if isinstance(var, paddle.static.InputSpec): + inner_input_spec.append(var) + elif isinstance(var, (core.VarBase, Variable)): + inner_input_spec.append( + paddle.static.InputSpec.from_tensor(var)) + else: raise TypeError( "The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s." % type(var)) - # 2. get program of declarative Layer.forward - if not isinstance(layer.forward, StaticLayer): - raise RuntimeError( - "layer.forward need to be decorated by `@declarative`.") - concrete_program = layer.forward.concrete_program - - # NOTE: we maintain the mapping of variable name to + # 2. get program from Layer + # TODO(chenweihang): add support for other method, not only forward + if isinstance(layer.forward, StaticLayer): + concrete_program = layer.forward.concrete_program + else: + # transform in jit.save, if input_spec is incomplete, declarative will throw error + static_forward = declarative(layer.forward, input_spec=inner_input_spec) + concrete_program = static_forward.concrete_program + # the input_spec has been used in declarative, which is equal to + # @declarative with input_spec and jit.save without input_spec, + # avoid needless warning + inner_input_spec = None + + # 3. 
build input & output of save_inference_model + # NOTE(chenweihang): [ Get input variables name ] + # There are two cases, depending on whether the inputs are pruned or not + # - not prune inputs (recommended): + # - len(input_spec) == len(concrete_program.inputs) - 1 + # - here we can use concrete_program.inputs directly + # - prune inputs: + # - the input_spec length < len(concrete_program.inputs) - 1 + # - the input_spec's name should be in concrete_program.inputs + input_var_names = _get_input_var_names(concrete_program.inputs, + inner_input_spec) + + # NOTE(chenweihang): [ Get output variables ] + # the rule is like [ Get input variables name ]. For output vars, + # we only support VarBase spec, and actually, we only need the + # var name of the output, and we don't recommend using output_spec + output_vars = _get_output_vars(concrete_program.outputs, + configs.output_spec) + + # NOTE(chenweihang): we maintain the mapping of variable name to # structured name, the buffer variable (non-persistable) # saved to inference program may not need by dygraph Layer, # we only record the state_dict variable's structured name @@ -807,7 +904,7 @@ def save(layer, model_path, input_spec=None, configs=None): for structured_name, var in six.iteritems(layer.state_dict()): state_names_dict[var.name] = structured_name - # 3. share parameters from Layer to scope & record var info + # 4. share parameters from Layer to scope & record var info scope = core.Scope() extra_var_info = dict() for param_or_buffer in concrete_program.parameters: @@ -825,10 +922,6 @@ def save(layer, model_path, input_spec=None, configs=None): extra_info_dict['trainable'] = param_or_buffer.trainable extra_var_info[param_or_buffer.name] = extra_info_dict - # 4. build input & output spec - input_var_names = get_inout_spec(concrete_program.inputs, input_spec, True) - output_vars = get_inout_spec(concrete_program.outputs, configs.output_spec) - # 5. save inference model from paddle.fluid.io import save_inference_model @@ -849,7 +942,7 @@ def save(layer, model_path, input_spec=None, configs=None): export_for_deployment=configs._export_for_deployment, program_only=configs._program_only) - # NOTE: [ Save extra variable info ] + # NOTE(chenweihang): [ Save extra variable info ] # save_inference_model will lose some important variable information, including: # - Variable name and correspondence (when saved variables as one file) # - Variable.stop_gradient information @@ -869,8 +962,9 @@ def save(layer, model_path, input_spec=None, configs=None): pickle.dump(extra_var_info, f, protocol=2) +@deprecate_save_load_configs @dygraph_only -def load(model_path, configs=None): +def load(model_path, config=None): """ :api_attr: imperative @@ -887,7 +981,7 @@ def load(model_path, configs=None): Args: model_path (str): The directory path where the model is saved. - configs (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies + config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies additional configuration options. Default None. Returns: TranslatedLayer: A Layer object can run saved translated model. Examples: 1. Load model saved by :ref:`api_imperative_jit_save` then performing inference and fine-tune training. ..
code-block:: python import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - BATCH_SIZE = 32 - BATCH_NUM = 20 + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 - def random_batch_reader(): - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label + IMAGE_SIZE = 784 + CLASS_NUM = 10 - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples - return __reader__ + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label - class LinearNet(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + def __init__(self): super(LinearNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - @declarative + @paddle.jit.to_static def forward(self, x): return self._linear(x) + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + # enable dygraph mode - fluid.enable_dygraph() + place = paddle.CPUPlace() + paddle.disable_static(place) # 1. train & save model. + # create network - net = LinearNet(784, 1) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) + # create data loader - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) - # train - for data in train_loader(): - img, label = data - label.stop_gradient = True + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) - cost = net(img) + # train + train(layer, loader, loss_fn, adam) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + # save + model_path = "linear.example.model" + paddle.jit.save(layer, model_path) - avg_loss.backward() - adam.minimize(avg_loss) - net.clear_gradients() + # 2. load model - model_path = "linear.example.model" - fluid.dygraph.jit.save( - layer=net, - model_path=model_path, - input_spec=[img]) + # load + loaded_layer = paddle.jit.load(model_path) - # 2. load model & inference - # load model - infer_net = fluid.dygraph.jit.load(model_path) # inference - x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32')) - pred = infer_net(x) + loaded_layer.eval() + x = paddle.randn([1, IMAGE_SIZE], 'float32') + pred = loaded_layer(x) - # 3. 
load model & fine-tune - # load model - train_net = fluid.dygraph.jit.load(model_path) - train_net.train() - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=train_net.parameters()) - # create data loader - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) # fine-tune - for data in train_loader(): - img, label = data - label.stop_gradient = True + loaded_layer.train() + adam = opt.Adam(learning_rate=0.001, parameters=loaded_layer.parameters()) + train(loaded_layer, loader, loss_fn, adam) - cost = train_net(img) - - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - - avg_loss.backward() - adam.minimize(avg_loss) - train_net.clear_gradients() 2. Load model saved by :ref:`api_fluid_io_save_inference_model` then performing and fine-tune training. .. code-block:: python import numpy as np + import paddle import paddle.fluid as fluid + import paddle.nn as nn + import paddle.optimizer as opt - BATCH_SIZE = 32 - BATCH_NUM = 20 + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 - def random_batch_reader(): - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label + IMAGE_SIZE = 784 + CLASS_NUM = 10 - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label - return __reader__ + def __len__(self): + return self.num_samples - img = fluid.data(name='img', shape=[None, 784], dtype='float32') + image = fluid.data(name='image', shape=[None, 784], dtype='float32') label = fluid.data(name='label', shape=[None, 1], dtype='int64') - pred = fluid.layers.fc(input=img, size=10, act='softmax') + pred = fluid.layers.fc(input=image, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=pred, label=label) avg_loss = fluid.layers.mean(loss) @@ -1025,9 +1123,15 @@ def load(model_path, configs=None): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - loader = fluid.io.DataLoader.from_generator( - feed_list=[img, label], capacity=5, iterable=True) - loader.set_batch_generator(random_batch_reader(), places=place) + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + feed_list=[image, label], + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) # 1. train and save inference model for data in loader(): @@ -1038,39 +1142,42 @@ def load(model_path, configs=None): model_path = "fc.example.model" fluid.io.save_inference_model( - model_path, ["img"], [pred], exe) + model_path, ["image"], [pred], exe) + + # 2. load model # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static(place) + + # load + fc = paddle.jit.load(model_path) - # 2. load model & inference - fc = fluid.dygraph.jit.load(model_path) - x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32')) + # inference + fc.eval() + x = paddle.randn([1, IMAGE_SIZE], 'float32') pred = fc(x) - # 3. 
load model & fine-tune - fc = fluid.dygraph.jit.load(model_path) + # fine-tune fc.train() - sgd = fluid.optimizer.SGD(learning_rate=0.001, - parameter_list=fc.parameters()) - - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator( - random_batch_reader(), places=place) - - for data in train_loader(): - img, label = data - label.stop_gradient = True - - cost = fc(img) - - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - - avg_loss.backward() - sgd.minimize(avg_loss) + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=fc.parameters()) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = fc(image) + loss = loss_fn(out, label) + loss.backward() + adam.step() + adam.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) """ - return TranslatedLayer._construct(model_path, configs) + return TranslatedLayer._construct(model_path, config) @dygraph_only diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 8c4109674200bf97354444f92f00b13e053152a0..3aa7b9dfc262810686319819f717f3cfd06b5e50 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -19,7 +19,6 @@ from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator from ..layers.layer_function_generator import OpProtoHolder from ..layers import common_methods from . import to_variable, no_grad -import paddle import numpy as np import six @@ -163,26 +162,6 @@ def monkey_patch_math_varbase(): def _scalar_div_(var, value): return _scalar_elementwise_op_(var, 1.0 / value, 0.0) - # TODO(shenliang03): currently, it supports divide, floor_divide, remainder - # for binary operator by using the api to achieve the type promotion - def _binary_method_creator_(op_type, reverse=False): - import paddle - - def __impl__(self, other_var): - import paddle - op = getattr(paddle, op_type) - if reverse: - return op(other_var, self) - else: - return op(self, other_var) - - __impl__.__doc__ = """ - - See paddle.{}""".format(op_type) - __impl__.__name__ = op_type - - return __impl__ - # for binary operator such as elementwise, compare def _binary_creator_(method_name, op_type, @@ -281,20 +260,22 @@ def monkey_patch_math_varbase(): ## a*b == b*a. 
Do not need to reverse explicitly ('__rmul__', _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), + ('__div__', _binary_creator_('__div__', 'elementwise_div', False, + _scalar_div_)), + ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', + False, _scalar_div_)), + ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, + None)), ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True, None)), ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, None)), ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True, None)), - # These binary use paddle.optype - ('__div__', _binary_method_creator_('divide', False)), - ('__truediv__', _binary_method_creator_('divide', False)), - ('__rtruediv__', _binary_method_creator_('divide', True)), - ('__rdiv__', _binary_method_creator_('divide', True)), - ('__floordiv__', _binary_method_creator_('floor_divide', False)), - ('__rfloordiv__', _binary_method_creator_('floor_divide', True)), - ('__mod__', _binary_method_creator_('remainder', False)), + ('__floordiv__', _binary_creator_('__floordiv__', + 'elementwise_floordiv', False, None)), + ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False, + None)), ## for logical compare ('__eq__', _binary_creator_('__eq__', 'equal', False, None)), ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)), @@ -304,7 +285,7 @@ ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)), ('__array_ufunc__', None), ('sigmoid', _method_creator_('sigmoid', 'name=None')), - ('logsigmoid', _method_creator_('logsigmoid', 'name=None')), + ('log_sigmoid', _method_creator_('logsigmoid', 'name=None')), ('exp', _method_creator_('exp', 'name=None')), ('tanh', _method_creator_('tanh', 'name=None')), ('atan', _method_creator_('atan', 'name=None')), diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 5281df9ead10acea5ae8656dcc4a0eed14fb3e83..797b32f5d4768af59fa4e6aceb75e4b6d9029d91 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -217,7 +217,7 @@ def _dygraph_not_support_(func): def _dygraph_only_(func): def __impl__(*args, **kwargs): assert in_dygraph_mode( - ), "We Only support %s in imperative mode, please use fluid.dygraph.guard() as context to run it in imperative Mode" % func.__name__ + ), "We only support %s in dynamic mode, please call 'paddle.disable_static()' to enter dynamic mode." % func.__name__ return func(*args, **kwargs) return __impl__ diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py index c8f3643b25be0780bbdfd1668d849ab00ece355c..c80b4a800bd149cced13c0e25322cd03ef94e468 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py @@ -12,9 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License.
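The vars_metatools hunk that follows attaches a rough memory-size estimate (m_size) to each VarStruct: the product of the shape entries times the byte width of the dtype. The same arithmetic in isolation, with a plain dict standing in for the core.VarDesc.VarType-keyed table (note that, like the patch, this assumes a non-empty shape with no unknown dimensions):

    from functools import reduce

    dtype_to_size = {'FP32': 4, 'FP64': 8, 'INT64': 8}  # stand-in for the enum-keyed table

    def var_bytes(shape, dtype):
        # product of all dims, times the per-element byte width
        return reduce(lambda x, y: x * y, shape) * dtype_to_size[dtype]

    print(var_bytes([128, 1024], 'FP32'))  # 524288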
from __future__ import print_function +from functools import reduce + from paddle.fluid.framework import Variable from paddle.fluid import core +dtype_to_size = { + core.VarDesc.VarType.FP16: 2, + core.VarDesc.VarType.FP32: 4, + core.VarDesc.VarType.FP64: 8, + core.VarDesc.VarType.INT16: 2, + core.VarDesc.VarType.INT32: 4, + core.VarDesc.VarType.INT64: 8, + core.VarDesc.VarType.BOOL: 1, + core.VarDesc.VarType.UINT8: 1, +} + class VarBlock: def __init__(self, varname, offset, size): @@ -51,11 +64,14 @@ class VarStruct(object): self.type = type self.lod_level = lod_level self.persistable = persistable + self.m_size = 1 + self.m_size = reduce(lambda x, y: x * y, shape) + self.m_size *= dtype_to_size[dtype] def __str__(self): - return "N: {}, S: {}, D: {}, T: {}, LL: {}, P: {}".format( + return "N: {}, S: {}, D: {}, T: {}, LL: {}, P: {}, M: {}".format( self.name, self.shape, self.dtype, self.type, self.lod_level, - self.persistable) + self.persistable, self.m_size) class VarDistributed(object): diff --git a/python/paddle/fluid/inference/__init__.py b/python/paddle/fluid/inference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3013c1f2aff87fb293ea984c99d8336b418ee080 --- /dev/null +++ b/python/paddle/fluid/inference/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .wrapper import Config, DataType, PlaceType, PrecisionType, Tensor, Predictor + +from ..core import create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool diff --git a/python/paddle/fluid/inference/wrapper.py b/python/paddle/fluid/inference/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..96885edcc5e822beb5db8332f2b58d12b9c4ff63 --- /dev/null +++ b/python/paddle/fluid/inference/wrapper.py @@ -0,0 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
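The wrapper module below gives the C++ inference bindings their user-facing names (Config, Predictor, Tensor, ...), and the package __init__ above re-exports create_predictor next to them. A sketch of the calling convention this enables; the predictor method names (get_input_names, get_input_handle, copy_from_cpu, ...) are assumptions based on the analysis-predictor API rather than something this patch shows, and the model directory is hypothetical:

    import numpy as np
    from paddle.fluid.inference import Config, create_predictor

    config = Config("./linear.example.model")  # hypothetical saved model dir
    predictor = create_predictor(config)

    input_name = predictor.get_input_names()[0]
    input_handle = predictor.get_input_handle(input_name)
    input_handle.copy_from_cpu(np.random.rand(1, 784).astype('float32'))

    predictor.run()

    output_name = predictor.get_output_names()[0]
    output_handle = predictor.get_output_handle(output_name)
    result = output_handle.copy_to_cpu()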
+ +from ..core import AnalysisConfig, PaddleDType, PaddlePlace +from ..core import PaddleInferPredictor, PaddleInferTensor + +DataType = PaddleDType +PlaceType = PaddlePlace +PrecisionType = AnalysisConfig.Precision +Config = AnalysisConfig +Tensor = PaddleInferTensor +Predictor = PaddleInferPredictor diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index ea6abe2d335e6669b27ba278c0faaca62ca0fdbb..bf87d1fc5a947e48845a3783fd71922641e28819 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -20,7 +20,8 @@ from __future__ import print_function from .layer_function_generator import generate_layer_fn from .layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper -from ..framework import Variable +from ..framework import Variable, in_dygraph_mode +from .. import core from .loss import softmax_with_cross_entropy from . import tensor from . import nn @@ -2893,8 +2894,8 @@ def generate_proposals(scores, nms_thresh=0.5, min_size=0.1, eta=1.0, - name=None, - return_rois_num=False): + return_rois_num=False, + name=None): """ :alias_main: paddle.nn.functional.generate_proposals :alias: paddle.nn.functional.generate_proposals,paddle.nn.functional.vision.generate_proposals @@ -2949,6 +2950,10 @@ def generate_proposals(scores, num of each image in one batch. The N is the image's num. For example, the tensor has values [4,5] that represents the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. 'False' by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually there is no need to set it, + and it is None by default. + Returns: tuple: A tuple with format ``(rpn_rois, rpn_roi_probs)``. @@ -2969,6 +2974,14 @@ def generate_proposals(scores, im_info, anchors, variances) """ + if in_dygraph_mode(): + assert return_rois_num, "return_rois_num should be True in dygraph mode."
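With the dygraph branch added above, generate_proposals requires return_rois_num=True when called imperatively, and then also returns the per-image RoI counts. A sketch of such a call; the shapes and random inputs are illustrative only:

    import numpy as np
    import paddle.fluid as fluid

    fluid.enable_dygraph()
    to_var = fluid.dygraph.to_variable

    N, A, H, W = 1, 4, 5, 5  # batch, anchors per location, feature map size
    scores = to_var(np.random.rand(N, A, H, W).astype('float32'))
    bbox_deltas = to_var(np.random.rand(N, 4 * A, H, W).astype('float32'))
    im_info = to_var(np.array([[H * 16., W * 16., 1.]], dtype='float32'))
    anchors = to_var(np.random.rand(H, W, A, 4).astype('float32'))
    variances = to_var(np.ones((H, W, A, 4), dtype='float32'))

    rois, roi_probs, rois_num = fluid.layers.generate_proposals(
        scores, bbox_deltas, im_info, anchors, variances,
        return_rois_num=True)  # must be True in dygraph mode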
+ attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, + 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta) + rpn_rois, rpn_roi_probs, rpn_rois_num = core.ops.generate_proposals( + scores, bbox_deltas, im_info, anchors, variances, *attrs) + return rpn_rois, rpn_roi_probs, rpn_rois_num + helper = LayerHelper('generate_proposals', **locals()) check_variable_and_dtype(scores, 'scores', ['float32'], @@ -2986,7 +2999,14 @@ def generate_proposals(scores, dtype=bbox_deltas.dtype) rpn_roi_probs = helper.create_variable_for_type_inference( dtype=scores.dtype) - rpn_rois_lod = helper.create_variable_for_type_inference(dtype='int32') + outputs = { + 'RpnRois': rpn_rois, + 'RpnRoiProbs': rpn_roi_probs, + } + if return_rois_num: + rpn_rois_num = helper.create_variable_for_type_inference(dtype='int32') + rpn_rois_num.stop_gradient = True + outputs['RpnRoisNum'] = rpn_rois_num helper.append_op( type="generate_proposals", @@ -3004,17 +3024,12 @@ def generate_proposals(scores, 'min_size': min_size, 'eta': eta }, - outputs={ - 'RpnRois': rpn_rois, - 'RpnRoiProbs': rpn_roi_probs, - 'RpnRoisLod': rpn_rois_lod - }) + outputs=outputs) rpn_rois.stop_gradient = True rpn_roi_probs.stop_gradient = True - rpn_rois_lod.stop_gradient = True if return_rois_num: - return rpn_rois, rpn_roi_probs, rpn_rois_lod + return rpn_rois, rpn_roi_probs, rpn_rois_num else: return rpn_rois, rpn_roi_probs @@ -3656,6 +3671,7 @@ def distribute_fpn_proposals(fpn_rois, max_level, refer_level, refer_scale, + rois_num=None, name=None): """ :alias_main: paddle.nn.functional.distribute_fpn_proposals @@ -3687,6 +3703,11 @@ def distribute_fpn_proposals(fpn_rois, come from. refer_level(int32): The referring level of FPN layer with specified scale. refer_scale(int32): The referring scale of FPN layer with specified level. + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + The shape is [B] and data type is int32. B is the number of images. + If it is not None then return a list of 1-D Tensor. Each element + is the output RoIs' number of each image on the corresponding level + and the shape is [B]. None by default. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -3702,6 +3723,10 @@ def distribute_fpn_proposals(fpn_rois, the number of total rois. The data type is int32. It is used to restore the order of fpn_rois. + rois_num_per_level(List): A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape + is [B] and data type of int32. B is the number of images + Examples: .. code-block:: python @@ -3716,26 +3741,52 @@ def distribute_fpn_proposals(fpn_rois, refer_level=4, refer_scale=224) """ + num_lvl = max_level - min_level + 1 + + if in_dygraph_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." 
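The dygraph fast path above requires return_rois_num=True and hands operator attributes to core.ops as a flat name/value sequence. A minimal imperative usage sketch, mirroring the unit test added later in this patch (shapes and top-n values are illustrative):

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        to_var = fluid.dygraph.to_variable
        scores = to_var(np.random.rand(2, 3, 4, 4).astype('float32'))
        bbox_deltas = to_var(np.random.rand(2, 12, 4, 4).astype('float32'))
        im_info = to_var(np.array([[8, 8, 0.5], [6, 6, 0.5]]).astype('float32'))
        anchors = to_var(np.arange(4 * 4 * 3 * 4).reshape(4, 4, 3, 4).astype('float32'))
        variances = to_var(np.ones((4, 4, 3, 4)).astype('float32'))
        rois, roi_probs, rois_num = fluid.layers.generate_proposals(
            scores, bbox_deltas, im_info, anchors, variances,
            pre_nms_top_n=10, post_nms_top_n=5, return_rois_num=True)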
+ attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', + refer_level, 'refer_scale', refer_scale) + multi_rois, restore_ind, rois_num_per_level = core.ops.distribute_fpn_proposals( + fpn_rois, rois_num, num_lvl, num_lvl, *attrs) + return multi_rois, restore_ind, rois_num_per_level + check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], 'distribute_fpn_proposals') helper = LayerHelper('distribute_fpn_proposals', **locals()) dtype = helper.input_dtype('fpn_rois') - num_lvl = max_level - min_level + 1 multi_rois = [ helper.create_variable_for_type_inference(dtype) for i in range(num_lvl) ] + restore_ind = helper.create_variable_for_type_inference(dtype='int32') + + inputs = {'FpnRois': fpn_rois} + outputs = { + 'MultiFpnRois': multi_rois, + 'RestoreIndex': restore_ind, + } + + if rois_num is not None: + inputs['RoisNum'] = rois_num + rois_num_per_level = [ + helper.create_variable_for_type_inference(dtype='int32') + for i in range(num_lvl) + ] + outputs['MultiLevelRoIsNum'] = rois_num_per_level + helper.append_op( type='distribute_fpn_proposals', - inputs={'FpnRois': fpn_rois}, - outputs={'MultiFpnRois': multi_rois, - 'RestoreIndex': restore_ind}, + inputs=inputs, + outputs=outputs, attrs={ 'min_level': min_level, 'max_level': max_level, 'refer_level': refer_level, 'refer_scale': refer_scale }) + if rois_num is not None: + return multi_rois, restore_ind, rois_num_per_level return multi_rois, restore_ind @@ -3820,6 +3871,7 @@ def collect_fpn_proposals(multi_rois, min_level, max_level, post_nms_top_n, + rois_num_per_level=None, name=None): """ :alias_main: paddle.nn.functional.collect_fpn_proposals @@ -3846,6 +3898,12 @@ def collect_fpn_proposals(multi_rois, min_level(int): The lowest level of FPN layer to collect max_level(int): The highest level of FPN layer to collect post_nms_top_n(int): The number of selected RoIs + rois_num_per_level(list, optional): The List of RoIs' numbers. + Each element is 1-D Tensor which contains the RoIs' number of each + image on each level and the shape is [B] and data type is + int32, B is the number of images. If it is not None then return + a 1-D Tensor contains the output RoIs' number of each image and + the shape is [B]. Default: None name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -3856,6 +3914,9 @@ def collect_fpn_proposals(multi_rois, fpn_rois(Variable): 2-D LoDTensor with shape [N, 4] and data type is float32 or float64. Selected RoIs. + rois_num(Tensor): 1-D Tensor contains the RoIs's number of each + image. The shape is [B] and data type is int32. B is the number of + images. Examples: .. code-block:: python @@ -3879,21 +3940,38 @@ def collect_fpn_proposals(multi_rois, """ check_type(multi_rois, 'multi_rois', list, 'collect_fpn_proposals') check_type(multi_scores, 'multi_scores', list, 'collect_fpn_proposals') + num_lvl = max_level - min_level + 1 + input_rois = multi_rois[:num_lvl] + input_scores = multi_scores[:num_lvl] + + if in_dygraph_mode(): + assert rois_num_per_level is not None, "rois_num_per_level should not be None in dygraph mode." 
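For context on distribute_fpn_proposals: each RoI is routed to a pyramid level, with refer_scale and refer_level anchoring the mapping. To the best of my knowledge this is the standard FPN heuristic of Lin et al.; the sketch below is my reading of that rule, not the operator's exact kernel:

.. code-block:: python

    import numpy as np

    def fpn_level(roi, min_level=2, max_level=5, refer_level=4, refer_scale=224):
        # roi is [x1, y1, x2, y2]; larger RoIs map to coarser (higher) levels
        w, h = roi[2] - roi[0], roi[3] - roi[1]
        level = np.floor(refer_level + np.log2(np.sqrt(w * h) / refer_scale))
        return int(np.clip(level, min_level, max_level))

    print(fpn_level([0, 0, 224, 224]))  # 4: an RoI at refer_scale stays on refer_level
    print(fpn_level([0, 0, 112, 112]))  # 3: halving the scale drops one level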
+ attrs = ('post_nms_topN', post_nms_top_n) + output_rois, rois_num = core.ops.collect_fpn_proposals( + input_rois, input_scores, rois_num_per_level, *attrs) + return output_rois, rois_num + helper = LayerHelper('collect_fpn_proposals', **locals()) dtype = helper.input_dtype('multi_rois') check_dtype(dtype, 'multi_rois', ['float32', 'float64'], 'collect_fpn_proposals') - num_lvl = max_level - min_level + 1 - input_rois = multi_rois[:num_lvl] - input_scores = multi_scores[:num_lvl] output_rois = helper.create_variable_for_type_inference(dtype) output_rois.stop_gradient = True + + inputs = { + 'MultiLevelRois': input_rois, + 'MultiLevelScores': input_scores, + } + outputs = {'FpnRois': output_rois} + if rois_num_per_level is not None: + inputs['MultiLevelRoIsNum'] = rois_num_per_level + rois_num = helper.create_variable_for_type_inference(dtype='int32') + rois_num.stop_gradient = True + outputs['RoisNum'] = rois_num helper.append_op( type='collect_fpn_proposals', - inputs={ - 'MultiLevelRois': input_rois, - 'MultiLevelScores': input_scores - }, - outputs={'FpnRois': output_rois}, + inputs=inputs, + outputs=outputs, attrs={'post_nms_topN': post_nms_top_n}) + if rois_num_per_level is not None: + return output_rois, rois_num return output_rois diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 38fc34472c8bc64338e2468bdf3f4b0bab1370ce..4595f0cf93916d71a3d0ec582af1917500d68f12 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -16,7 +16,6 @@ from __future__ import print_function import warnings import inspect -import paddle from .. import core from ..framework import Variable, unique_name @@ -46,7 +45,6 @@ EXPRESSION_MAP = { "__pow__": "A ** B", "__rpow__": "A **= B", "__floordiv__": "A //B", - "__rfloordiv__": "A //= B", "__mod__": "A % B", "__eq__": "A == B", "__ne__": "A != B", @@ -235,25 +233,6 @@ def monkey_patch_variable(): def _scalar_div_(var, value): return _scalar_op_(var, 1.0 / value, 0.0) - # TODO(shenliang03): currently, it supports divide, floor_divide, remainder - # for binary operator by using the api to achieve the type promotion - def _binary_method_creator_(op_type, reverse=False): - import paddle - - def __impl__(self, other_var): - op = getattr(paddle, op_type) - if reverse: - return op(other_var, self) - else: - return op(self, other_var) - - __impl__.__doc__ = """ - - See paddle.{}""".format(op_type) - __impl__.__name__ = op_type - - return __impl__ - def _binary_creator_(method_name, op_type, reverse=False, @@ -360,18 +339,22 @@ def monkey_patch_variable(): # a*b == b*a.
Do not need to reverse explicitly ('__rmul__', _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), + ('__div__', _binary_creator_('__div__', 'elementwise_div', False, + _scalar_div_)), + ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', + False, _scalar_div_)), + ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, + None)), + ('__rtruediv__', _binary_creator_('__rtruediv__', 'elementwise_div', + True, None)), ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, None)), ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True, None)), - # These binary use paddle.optype - ('__div__', _binary_method_creator_('divide', False)), - ('__rdiv__', _binary_method_creator_('divide', True)), - ('__truediv__', _binary_method_creator_('divide', False)), - ('__rtruediv__', _binary_method_creator_('divide', True)), - ('__floordiv__', _binary_method_creator_('floor_divide', False)), - ('__rfloordiv__', _binary_method_creator_('floor_divide', True)), - ('__mod__', _binary_method_creator_('remainder', False)), + ('__floordiv__', _binary_creator_('__floordiv__', + 'elementwise_floordiv', False, None)), + ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False, + None)), # for logical compare ('__eq__', _binary_creator_('__eq__', 'equal', False, None)), ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)), diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9313de8c64fcf4efc1e192ad2826f05f51869bbf..bc9f182d95e3b728fbc0866e1c79f5508d3a04aa 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6306,6 +6306,15 @@ def unsqueeze(input, axes, name=None): """ if in_dygraph_mode(): + if isinstance(axes, int): + axes = [axes] + elif isinstance(axes, Variable): + axes = axes.numpy().tolist() + elif isinstance(axes, (list, tuple)): + axes = [ + item.numpy().item(0) if isinstance(item, Variable) else item + for item in axes + ] out, _ = core.ops.unsqueeze2(input, 'axes', axes) return out @@ -6853,7 +6862,8 @@ def roi_pool(input, pooled_height=1, pooled_width=1, spatial_scale=1.0, - rois_lod=None): + rois_num=None, + name=None): """ :alias_main: paddle.nn.functional.roi_pool :alias: paddle.nn.functional.roi_pool,paddle.nn.functional.vision.roi_pool @@ -6873,10 +6883,14 @@ def roi_pool(input, Args: input (Variable): Input feature, 4D-Tensor with the shape of [N,C,H,W], where N is the batch size, C is the input channel, H is the height, W is the width. The data type is float32 or float64. rois (Variable): ROIs (Regions of Interest) to pool over. 2D-LoDTensor with the shape of [num_rois,4], the lod level is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. - rois_lod (Variable): The lod info of rois. Default: None pooled_height (int, optional): The pooled output height, data type is int32. Default: 1 pooled_width (int, optional): The pooled output width, data type is int32. Default: 1 spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0 + rois_num (Tensor): The number of RoIs in each image. Default: None + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Returns: Variable: The pooled feature, 4D-Tensor with the shape of [num_rois, C, pooled_height, pooled_width].
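The unsqueeze hunk above normalizes axes before calling core.ops.unsqueeze2, so the imperative path now accepts a plain int, a Variable, or a list that may mix both. A small sketch of the accepted forms (shapes are illustrative):

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        x = fluid.dygraph.to_variable(np.random.rand(3, 4).astype('float32'))
        ax = fluid.dygraph.to_variable(np.array([0]).astype('int32'))
        print(fluid.layers.unsqueeze(x, axes=1).shape)       # int      -> [3, 1, 4]
        print(fluid.layers.unsqueeze(x, axes=[0, 2]).shape)  # list     -> [1, 3, 1, 4]
        print(fluid.layers.unsqueeze(x, axes=ax).shape)      # Variable -> [1, 3, 4]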
@@ -6896,11 +6910,11 @@ def roi_pool(input, input_data = np.array([i for i in range(1,17)]).reshape(1,1,4,4).astype(DATATYPE) roi_data =fluid.create_lod_tensor(np.array([[1., 1., 2., 2.], [1.5, 1.5, 3., 3.]]).astype(DATATYPE),[[2]], place) - rois_lod_data = np.array([0, 2]) + rois_num_data = np.array([2]).astype('int32') x = fluid.data(name='input', shape=[None,1,4,4], dtype=DATATYPE) rois = fluid.data(name='roi', shape=[None,4], dtype=DATATYPE) - rois_lod = fluid.data(name='rois_lod', shape=[None], dtype='int64') + rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32') pool_out = fluid.layers.roi_pool( input=x, @@ -6908,24 +6922,36 @@ def roi_pool(input, pooled_height=1, pooled_width=1, spatial_scale=1.0, - rois_lod=rois_lod) + rois_num=rois_num) exe = fluid.Executor(place) - out, = exe.run(feed={'input':input_data ,'roi':roi_data, 'rois_lod': rois_lod_data}, fetch_list=[pool_out.name]) + out, = exe.run(feed={'input':input_data ,'roi':roi_data, 'rois_num': rois_num_data}, fetch_list=[pool_out.name]) print(out) #array([[[[11.]]], [[[16.]]]], dtype=float32) print(np.array(out).shape) # (2, 1, 1, 1) """ + if in_dygraph_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." + pool_out, argmaxes = core.ops.roi_pool( + input, rois, rois_num, "pooled_height", pooled_height, + "pooled_width", pooled_width, "spatial_scale", spatial_scale) + return pool_out, argmaxes + check_variable_and_dtype(input, 'input', ['float32'], 'roi_pool') check_variable_and_dtype(rois, 'rois', ['float32'], 'roi_pool') helper = LayerHelper('roi_pool', **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) argmaxes = helper.create_variable_for_type_inference(dtype='int32') + + inputs = { + "X": input, + "ROIs": rois, + } + if rois_num is not None: + inputs['RoisNum'] = rois_num helper.append_op( type="roi_pool", - inputs={"X": input, - "ROIs": rois, - "RoisLod": rois_lod}, + inputs=inputs, outputs={"Out": pool_out, "Argmax": argmaxes}, attrs={ @@ -6943,8 +6969,8 @@ def roi_align(input, pooled_width=1, spatial_scale=1.0, sampling_ratio=-1, - name=None, - rois_lod=None): + rois_num=None, + name=None): """ :alias_main: paddle.nn.functional.roi_align :alias: paddle.nn.functional.roi_align,paddle.nn.functional.vision.roi_align @@ -6959,11 +6985,11 @@ def roi_align(input, data type is float32 or float64. Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. - rois_lod (Variable): The lod info of rois. Default: None pooled_height (int32, optional): ${pooled_height_comment} Default: 1 pooled_width (int32, optional): ${pooled_width_comment} Default: 1 spatial_scale (float32, optional): ${spatial_scale_comment} Default: 1.0 sampling_ratio(int32, optional): ${sampling_ratio_comment} Default: -1 + rois_num (Tensor): The number of RoIs in each image. Default: None name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. 
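The roi_pool example updated above captures the rois_lod to rois_num migration made throughout this patch: the old input was a cumulative LoD offset vector (int64), the new one is a per-image RoI count (int32). The two encodings are interconvertible, as this sketch shows:

.. code-block:: python

    import numpy as np

    rois_lod = np.array([0, 2], dtype='int64')    # old: offsets, len = num_images + 1
    rois_num = np.diff(rois_lod).astype('int32')  # new: counts -> array([2], dtype=int32)
    assert rois_num.sum() == rois_lod[-1]         # both encode 2 RoIs in 1 image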
@@ -6982,26 +7008,38 @@ def roi_align(input, name='data', shape=[None, 256, 32, 32], dtype='float32') rois = fluid.data( name='rois', shape=[None, 4], dtype='float32') - rois_lod = fluid.data(name='rois_lod', shape=[None], dtype='int64') + rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32') align_out = fluid.layers.roi_align(input=x, rois=rois, pooled_height=7, pooled_width=7, spatial_scale=0.5, sampling_ratio=-1, - rois_lod=rois_lod) + rois_num=rois_num) """ + if in_dygraph_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." + align_out = core.ops.roi_align( + input, rois, rois_num, "pooled_height", pooled_height, + "pooled_width", pooled_width, "spatial_scale", spatial_scale, + "sampling_ratio", sampling_ratio) + return align_out + check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'roi_align') check_variable_and_dtype(rois, 'rois', ['float32', 'float64'], 'roi_align') helper = LayerHelper('roi_align', **locals()) dtype = helper.input_dtype() align_out = helper.create_variable_for_type_inference(dtype) + inputs = { + "X": input, + "ROIs": rois, + } + if rois_num is not None: + inputs['RoisNum'] = rois_num helper.append_op( type="roi_align", - inputs={"X": input, - "ROIs": rois, - "RoisLod": rois_lod}, + inputs=inputs, outputs={"Out": align_out}, attrs={ "pooled_height": pooled_height, @@ -10841,8 +10879,7 @@ def slice(input, axes, starts, ends): result = [ [2, 3, 4], ] # result = data[0:1, 1:4] Args: input (Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``. - axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to. - It's optional. If it is not provides, it will be treated as :math:`[0,1,...,len(starts)-1]`. + axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to . starts (list|tuple|Variable): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``starts`` is an Variable, it should be an 1-D Tensor. It represents starting indices of corresponding axis in ``axes``. 
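Since axes is now documented as mandatory for slice, the correspondence with Python slicing is worth restating. Assuming the docstring's own example data, the call below is equivalent to data[0:1, 1:4]:

.. code-block:: python

    import paddle.fluid as fluid

    data = fluid.data(name='data', shape=[2, 4], dtype='float32')
    # axes/starts/ends are parallel lists: axis 0 takes [0:1], axis 1 takes [1:4]
    result = fluid.layers.slice(data, axes=[0, 1], starts=[0, 1], ends=[1, 4])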
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 1efae3ddf1f3422a53f69c4b5b8eeec6183fae96..6cdc617a0dc17ae9f0893083285c404ca73712f7 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -20,7 +20,10 @@ from ..framework import convert_np_dtype_to_dtype_, Variable from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from paddle.utils import deprecated -__deprecated_func_name__ = {'tanh_shrink': 'tanhshrink', } +__deprecated_func_name__ = { + 'tanh_shrink': 'tanhshrink', + 'logsigmoid': 'log_sigmoid' +} __activations_noattr__ = [ 'sigmoid', @@ -106,7 +109,7 @@ Examples: paddle.disable_static() x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = F.logsigmoid(x) + out = F.log_sigmoid(x) print(out.numpy()) # [-0.91301525 -0.79813887 -0.64439666 -0.55435524] diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index a90551c1b7b4fd45ae9a0e1cfa225a87db811295..89acfc6075be0b625da04d187cd46dd47ac699c9 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -36,7 +36,7 @@ __all__ = [ 'tensor_array_to_tensor', 'concat', 'sums', 'assign', 'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax', 'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite', - 'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye' + 'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye', 'triu' ] @@ -1725,3 +1725,9 @@ def ones_like(x, out=None): attrs={'value': 1.0}, outputs={'Out': [out]}) return out + + +@deprecated(since="2.0.0", update_to="paddle.triu") +def triu(input, diagonal=0, name=None): + import paddle + return paddle.tensor.triu(x=input, diagonal=diagonal, name=name) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py old mode 100644 new mode 100755 index 8b37cfef3890eace0ff5141eeb91d85e78f1c964..192effd2e42dc937fbf47efdd1d772a4c078f888 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -1604,7 +1604,7 @@ class LarsMomentumOptimizer(Optimizer): & local\_learning\_rate = learning\_rate * lars\_coeff * \\ \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||} - & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param) + & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon) & param = param - velocity @@ -1628,7 +1628,9 @@ class LarsMomentumOptimizer(Optimizer): :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): This parameter is used by developers to print debugging information. \ For details, please refer to :ref:`api_guide_Name`. Default is None. - + exclude_from_weight_decay (list[str], optional): Name string of layers which will be excluded from lars weight decay. Default is None. + epsilon (float, optional): Epsilon to avoid division by zero when calculating the local learning rate. Default is 0. + Examples: ..
code-block:: python @@ -1659,7 +1661,9 @@ class LarsMomentumOptimizer(Optimizer): parameter_list=None, regularization=None, grad_clip=None, - name=None): + name=None, + exclude_from_weight_decay=None, + epsilon=0): assert learning_rate is not None assert momentum is not None super(LarsMomentumOptimizer, self).__init__( @@ -1672,6 +1676,11 @@ class LarsMomentumOptimizer(Optimizer): self._momentum = momentum self._lars_coeff = float(lars_coeff) self._lars_weight_decay = float(lars_weight_decay) + self._epsilon = float(epsilon) + if exclude_from_weight_decay is None: + self._exclude_from_weight_decay = [] + else: + self._exclude_from_weight_decay = exclude_from_weight_decay def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -1682,6 +1691,14 @@ class LarsMomentumOptimizer(Optimizer): def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) + _lars_weight_decay = self._lars_weight_decay + param_name = param_and_grad[0].name + if len(self._exclude_from_weight_decay) > 0: + for name in self._exclude_from_weight_decay: + if name in param_name: + _lars_weight_decay = 0.0 + break + velocity_acc = self._get_accumulator(self._velocity_acc_str, param_and_grad[0]) # create the momentum optimize op @@ -1700,7 +1717,8 @@ class LarsMomentumOptimizer(Optimizer): attrs={ "mu": self._momentum, "lars_coeff": self._lars_coeff, - "lars_weight_decay": self._lars_weight_decay + "lars_weight_decay": _lars_weight_decay, + "epsilon": self._epsilon }, stop_gradient=True) diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 76c95be75d67d60cd59efe13ecba6f01a1c1d614..533222531f98b188f9fe5b47184ff39736488bd6 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -85,6 +85,30 @@ def _convert_places(places): return ret +# NOTE(chenweihang): _reader_process_loop must be top level method to be pickled +def _reader_process_loop(batch_reader, data_queue): + try: + # set signal handler + core._set_process_signal_handler() + + # NOTE: [ mmap files clear ] When the child process exits unexpectedly, + # some shared memory objects may have been applied for but have not yet + # been put into the inter-process Queue. This part of the object needs + # to be cleaned up when the process ends. 
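A note on why _reader_process_loop above had to move to module scope: under the spawn start method, multiprocessing pickles the process target, and a bound method drags its whole instance (queues, readers, blocking state) into the pickle, which can fail outright or copy far too much. The pattern in miniature, independent of Paddle:

.. code-block:: python

    import multiprocessing as mp

    def reader():                    # stand-in for the batch reader
        for i in range(3):
            yield i

    def _loop(batch_reader, queue):  # top level, so picklable by qualified name
        for batch in batch_reader():
            queue.put(batch)
        queue.put(None)              # sentinel marks the end of the stream

    if __name__ == '__main__':
        q = mp.Queue()
        p = mp.Process(target=_loop, args=(reader, q))
        p.start()
        for item in iter(q.get, None):
            print(item)              # 0, 1, 2
        p.join()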
+ CleanupFuncRegistrar.register(_cleanup_mmap) + + for batch in batch_reader(): + tensor_list = core._convert_to_tensor_list(batch) + data_queue.put(tensor_list) + core._remove_tensor_list_mmap_fds(tensor_list) + data_queue.put(None) + except KeyboardInterrupt: + # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process + pass + except: + six.reraise(*sys.exc_info()) + + class DataLoaderBase(object): def __init__(self): self._places = None @@ -811,7 +835,8 @@ class DygraphGeneratorLoader(DataLoaderBase): global multiprocess_queue_set multiprocess_queue_set.add(self._data_queue) self._process = multiprocessing.Process( - target=self._reader_process_loop) + target=_reader_process_loop, + args=(self._batch_reader, self._data_queue)) self._process.daemon = True self._process.start() @@ -867,28 +892,6 @@ class DygraphGeneratorLoader(DataLoaderBase): self._blocking_queue.kill() logging.error("DataLoader reader thread raised an exception!") - def _reader_process_loop(self): - try: - # set signal handler - core._set_process_signal_handler() - - # NOTE: [ mmap files clear ] When the child process exits unexpectedly, - # some shared memory objects may have been applied for but have not yet - # been put into the inter-process Queue. This part of the object needs - # to be cleaned up when the process ends. - CleanupFuncRegistrar.register(_cleanup_mmap) - - for batch in self._batch_reader(): - tensor_list = core._convert_to_tensor_list(batch) - self._data_queue.put(tensor_list) - core._remove_tensor_list_mmap_fds(tensor_list) - self._data_queue.put(None) - except KeyboardInterrupt: - # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process - pass - except: - six.reraise(*sys.exc_info()) - def _reader_thread_loop_for_multiprocess(self): while not self._thread_done_event.is_set(): try: @@ -1723,13 +1726,13 @@ class DatasetLoader(DataLoaderBase): logging.warn('thread_num {} which is set in Dataset is ignored'. format(dataset.thread_num)) - dataset.set_thread(thread_num) + dataset._set_thread(thread_num) if isinstance(dataset, paddle.distributed.fleet.dataset. InMemoryDataset) and dataset.queue_num > thread_num: logging.warn("queue_num {} which is set in Dataset is ignored". 
format(dataset.queue_num)) - dataset.set_queue_num(thread_num) + dataset._set_queue_num(thread_num) self._dataset = dataset use_slots = [ diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index a1526934f4aa1415c529e79bfa8dea6c0754bea9..425c4e3c7e38cff2f892eff28428082b57b3727d 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -19,6 +19,57 @@ import paddle.fluid.layers as layers from paddle.fluid.layers import detection from paddle.fluid.framework import Program, program_guard import unittest +import contextlib +import numpy as np +from unittests.test_imperative_base import new_program_scope +from paddle.fluid.dygraph import base +from paddle.fluid import core + + +class LayerTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.seed = 111 + + @classmethod + def tearDownClass(cls): + pass + + def _get_place(self, force_to_use_cpu=False): + # this option for ops that only have cpu kernel + if force_to_use_cpu: + return core.CPUPlace() + else: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + return core.CPUPlace() + + @contextlib.contextmanager + def static_graph(self): + with new_program_scope(): + fluid.default_startup_program().random_seed = self.seed + fluid.default_main_program().random_seed = self.seed + yield + + def get_static_graph_result(self, + feed, + fetch_list, + with_lod=False, + force_to_use_cpu=False): + exe = fluid.Executor(self._get_place(force_to_use_cpu)) + exe.run(fluid.default_startup_program()) + return exe.run(fluid.default_main_program(), + feed=feed, + fetch_list=fetch_list, + return_numpy=(not with_lod)) + + @contextlib.contextmanager + def dynamic_graph(self, force_to_use_cpu=False): + with fluid.dygraph.guard( + self._get_place(force_to_use_cpu=force_to_use_cpu)): + fluid.default_startup_program().random_seed = self.seed + fluid.default_main_program().random_seed = self.seed + yield class TestDetection(unittest.TestCase): @@ -481,45 +532,67 @@ class TestRpnTargetAssign(unittest.TestCase): print(str(program)) -class TestGenerateProposals(unittest.TestCase): +class TestGenerateProposals(LayerTest): def test_generate_proposals(self): - program = Program() - with program_guard(program): - data_shape = [20, 64, 64] - images = fluid.layers.data( - name='images', shape=data_shape, dtype='float32') - im_info = fluid.layers.data( - name='im_info', shape=[3], dtype='float32') - anchors, variances = fluid.layers.anchor_generator( - name='anchor_generator', - input=images, - anchor_sizes=[32, 64], - aspect_ratios=[1.0], - variance=[0.1, 0.1, 0.2, 0.2], - stride=[16.0, 16.0], - offset=0.5) - num_anchors = anchors.shape[2] - scores = fluid.layers.data( - name='scores', shape=[num_anchors, 8, 8], dtype='float32') - bbox_deltas = fluid.layers.data( - name='bbox_deltas', - shape=[num_anchors * 4, 8, 8], - dtype='float32') - rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( - name='generate_proposals', - scores=scores, - bbox_deltas=bbox_deltas, - im_info=im_info, - anchors=anchors, - variances=variances, - pre_nms_top_n=6000, - post_nms_top_n=1000, - nms_thresh=0.5, - min_size=0.1, - eta=1.0) - self.assertIsNotNone(rpn_rois) - self.assertIsNotNone(rpn_roi_probs) - print(rpn_rois.shape) + scores_np = np.random.rand(2, 3, 4, 4).astype('float32') + bbox_deltas_np = np.random.rand(2, 12, 4, 4).astype('float32') + im_info_np = np.array([[8, 8, 0.5], [6, 6, 0.5]]).astype('float32') + anchors_np = np.reshape(np.arange(4 * 4 * 3 * 4), + 
[4, 4, 3, 4]).astype('float32') + variances_np = np.ones((4, 4, 3, 4)).astype('float32') + + with self.static_graph(): + scores = fluid.data( + name='scores', shape=[2, 3, 4, 4], dtype='float32') + bbox_deltas = fluid.data( + name='bbox_deltas', shape=[2, 12, 4, 4], dtype='float32') + im_info = fluid.data(name='im_info', shape=[2, 3], dtype='float32') + anchors = fluid.data( + name='anchors', shape=[4, 4, 3, 4], dtype='float32') + variances = fluid.data( + name='var', shape=[4, 4, 3, 4], dtype='float32') + rois, roi_probs, rois_num = fluid.layers.generate_proposals( + scores, + bbox_deltas, + im_info, + anchors, + variances, + pre_nms_top_n=10, + post_nms_top_n=5, + return_rois_num=True) + rois_stat, roi_probs_stat, rois_num_stat = self.get_static_graph_result( + feed={ + 'scores': scores_np, + 'bbox_deltas': bbox_deltas_np, + 'im_info': im_info_np, + 'anchors': anchors_np, + 'var': variances_np + }, + fetch_list=[rois, roi_probs, rois_num], + with_lod=True) + + with self.dynamic_graph(): + scores_dy = base.to_variable(scores_np) + bbox_deltas_dy = base.to_variable(bbox_deltas_np) + im_info_dy = base.to_variable(im_info_np) + anchors_dy = base.to_variable(anchors_np) + variances_dy = base.to_variable(variances_np) + rois, roi_probs, rois_num = fluid.layers.generate_proposals( + scores_dy, + bbox_deltas_dy, + im_info_dy, + anchors_dy, + variances_dy, + pre_nms_top_n=10, + post_nms_top_n=5, + return_rois_num=True) + rois_dy = rois.numpy() + roi_probs_dy = roi_probs.numpy() + rois_num_dy = rois_num.numpy() + + self.assertTrue(np.array_equal(np.array(rois_stat), rois_dy)) + self.assertTrue(np.array_equal(np.array(roi_probs_stat), roi_probs_dy)) + self.assertTrue(np.array_equal(np.array(rois_num_stat), rois_num_dy)) class TestYoloDetection(unittest.TestCase): @@ -648,30 +721,81 @@ class TestMulticlassNMS2(unittest.TestCase): self.assertIsNotNone(index) -class TestCollectFpnPropsals(unittest.TestCase): +class TestCollectFpnPropsals(LayerTest): def test_collect_fpn_proposals(self): - program = Program() - with program_guard(program): + multi_bboxes_np = [] + multi_scores_np = [] + rois_num_per_level_np = [] + for i in range(4): + bboxes_np = np.random.rand(5, 4).astype('float32') + scores_np = np.random.rand(5, 1).astype('float32') + rois_num = np.array([2, 3]).astype('int32') + multi_bboxes_np.append(bboxes_np) + multi_scores_np.append(scores_np) + rois_num_per_level_np.append(rois_num) + + with self.static_graph(): multi_bboxes = [] multi_scores = [] + rois_num_per_level = [] for i in range(4): - bboxes = layers.data( + bboxes = fluid.data( name='rois' + str(i), - shape=[10, 4], + shape=[5, 4], dtype='float32', - lod_level=1, - append_batch_size=False) - scores = layers.data( + lod_level=1) + scores = fluid.data( name='scores' + str(i), - shape=[10, 1], + shape=[5, 1], dtype='float32', - lod_level=1, - append_batch_size=False) + lod_level=1) + rois_num = fluid.data( + name='rois_num' + str(i), shape=[None], dtype='int32') + multi_bboxes.append(bboxes) multi_scores.append(scores) - fpn_rois = layers.collect_fpn_proposals(multi_bboxes, multi_scores, - 2, 5, 10) - self.assertIsNotNone(fpn_rois) + rois_num_per_level.append(rois_num) + + fpn_rois, rois_num = layers.collect_fpn_proposals( + multi_bboxes, + multi_scores, + 2, + 5, + 10, + rois_num_per_level=rois_num_per_level) + feed = {} + for i in range(4): + feed['rois' + str(i)] = multi_bboxes_np[i] + feed['scores' + str(i)] = multi_scores_np[i] + feed['rois_num' + str(i)] = rois_num_per_level_np[i] + fpn_rois_stat, rois_num_stat = 
self.get_static_graph_result( + feed=feed, fetch_list=[fpn_rois, rois_num], with_lod=True) + fpn_rois_stat = np.array(fpn_rois_stat) + rois_num_stat = np.array(rois_num_stat) + + with self.dynamic_graph(): + multi_bboxes_dy = [] + multi_scores_dy = [] + rois_num_per_level_dy = [] + for i in range(4): + bboxes_dy = base.to_variable(multi_bboxes_np[i]) + scores_dy = base.to_variable(multi_scores_np[i]) + rois_num_dy = base.to_variable(rois_num_per_level_np[i]) + multi_bboxes_dy.append(bboxes_dy) + multi_scores_dy.append(scores_dy) + rois_num_per_level_dy.append(rois_num_dy) + fpn_rois_dy, rois_num_dy = fluid.layers.collect_fpn_proposals( + multi_bboxes_dy, + multi_scores_dy, + 2, + 5, + 10, + rois_num_per_level=rois_num_per_level_dy) + fpn_rois_dy = fpn_rois_dy.numpy() + rois_num_dy = rois_num_dy.numpy() + + self.assertTrue(np.array_equal(fpn_rois_stat, fpn_rois_dy)) + self.assertTrue(np.array_equal(rois_num_stat, rois_num_dy)) def test_collect_fpn_proposals_error(self): def generate_input(bbox_type, score_type, name): @@ -717,20 +841,51 @@ class TestCollectFpnPropsals(unittest.TestCase): post_nms_top_n=2000) -class TestDistributeFpnProposals(unittest.TestCase): +class TestDistributeFpnProposals(LayerTest): def test_distribute_fpn_proposals(self): - program = Program() - with program_guard(program): - fpn_rois = fluid.layers.data( - name='data', shape=[4], dtype='float32', lod_level=1) - multi_rois, restore_ind = layers.distribute_fpn_proposals( - fpn_rois=fpn_rois, + rois_np = np.random.rand(10, 4).astype('float32') + rois_num_np = np.array([4, 6]).astype('int32') + with self.static_graph(): + rois = fluid.data(name='rois', shape=[10, 4], dtype='float32') + rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32') + multi_rois, restore_ind, rois_num_per_level = layers.distribute_fpn_proposals( + fpn_rois=rois, min_level=2, max_level=5, refer_level=4, - refer_scale=224) - self.assertIsNotNone(multi_rois) - self.assertIsNotNone(restore_ind) + refer_scale=224, + rois_num=rois_num) + fetch_list = multi_rois + [restore_ind] + rois_num_per_level + output_stat = self.get_static_graph_result( + feed={'rois': rois_np, + 'rois_num': rois_num_np}, + fetch_list=fetch_list, + with_lod=True) + output_stat_np = [] + for output in output_stat: + output_np = np.array(output) + if len(output_np) > 0: + output_stat_np.append(output_np) + + with self.dynamic_graph(): + rois_dy = base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + multi_rois_dy, restore_ind_dy, rois_num_per_level_dy = layers.distribute_fpn_proposals( + fpn_rois=rois_dy, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num_dy) + output_dy = multi_rois_dy + [restore_ind_dy] + rois_num_per_level_dy + output_dy_np = [] + for output in output_dy: + output_np = output.numpy() + if len(output_np) > 0: + output_dy_np.append(output_np) + + for res_stat, res_dy in zip(output_stat_np, output_dy_np): + self.assertTrue(np.array_equal(res_stat, res_dy)) def test_distribute_fpn_proposals_error(self): program = Program() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a25cba029dd8bac81d6b00c1d9fb710f421ce9d0..fa092ffb191601192fa5fed050b1e1f995896058 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -47,6 +47,8 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) list(APPEND 
MIXED_DIST_TEST_OPS test_fleet_graph_executor) list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_distributed_strategy) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() @@ -100,6 +102,7 @@ if(WIN32) endif() +LIST(REMOVE_ITEM TEST_OPS test_fleet_rolemaker_new) LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint) LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint1) LIST(REMOVE_ITEM TEST_OPS test_auto_checkpoint2) @@ -397,17 +400,17 @@ py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${G py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS}) py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS FLAGS_cudnn_deterministic=1 SERIAL) -set_tests_properties(test_imperative_resnet PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") +set_tests_properties(test_imperative_resnet PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_resnet_sorted_gradient MODULES test_imperative_resnet_sorted_gradient ENVS FLAGS_cudnn_deterministic=1 SERIAL) -set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") +set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mnist_sorted_gradient ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS FLAGS_cudnn_deterministic=1 SERIAL) -set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") +set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_attention_model ENVS FLAGS_cudnn_deterministic=1 SERIAL) py_test_modules(test_install_check MODULES test_install_check ENVS @@ -458,10 +461,12 @@ if(WITH_DISTRIBUTE) py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) + py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy) + #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS}) if(NOT WIN32) py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) + #py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS}) + #py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) endif(NOT WIN32) endif(NOT APPLE) if(WITH_DGC) @@ -555,8 +560,8 @@ endif() set_tests_properties(test_parallel_executor_test_while_train test_parallel_executor_mnist test_parallel_executor_feed_persistable_var test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass - test_data_norm_op test_imperative_using_non_zero_gpu 
test_fuse_bn_act_pass - test_optimizer_in_control_flow test_dataloader_keep_order + test_data_norm_op test_imperative_using_non_zero_gpu + test_dataloader_keep_order test_dataloader_unkeep_order test_parallel_executor_fetch_isolated_var test_parallel_executor_inference_feed_partial_data diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index dc39472d7aed8f52ee3bb0f85a5e503db9093070..1b0ce0c03e7c64300ec01a863d185027bb2302a5 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -208,14 +208,16 @@ class TestDistCTR2x2(FleetDistRunnerBase): filelist = train_file_list # config dataset - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset() - dataset.set_batch_size(batch_size) - dataset.set_use_var(self.feeds) + dataset = paddle.distributed.QueueDataset() pipe_command = 'python ctr_dataset_reader.py' - dataset.set_pipe_command(pipe_command) + + dataset.init( + batch_size=batch_size, + use_var=self.feeds, + pipe_command=pipe_command, + thread_num=thread_num) dataset.set_filelist(filelist) - dataset.set_thread(thread_num) for epoch_id in range(1): pass_start = time.time() diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py index 03d0fa447daf3e3a502e7d77491045f92695496c..0e3c80992771424e4216a79b991de1c62884c757 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py @@ -114,14 +114,14 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2): filelist.append(train_file_path) # config dataset - dataset = paddle.fleet.DatasetFactory().create_dataset() - dataset.set_batch_size(batch_size) - dataset.set_use_var(self.feeds) + dataset = paddle.distributed.QueueDataset() + dataset._set_batch_size(batch_size) + dataset._set_use_var(self.feeds) pipe_command = 'python ctr_dataset_reader.py' - dataset.set_pipe_command(pipe_command) + dataset._set_pipe_command(pipe_command) dataset.set_filelist(filelist) - dataset.set_thread(thread_num) + dataset._set_thread(thread_num) for epoch_id in range(1): pass_start = time.time() diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py index 7a4e7534f07391956cd94577847c8a8f77895818..a5633bb0450f9aca73a9dc9a9e6fdfbdc69d2f7b 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py @@ -183,14 +183,14 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase): print("filelist: {}".format(filelist)) # config dataset - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset() - dataset.set_batch_size(batch_size) - dataset.set_use_var(self.feeds) + dataset = paddle.distributed.QueueDataset() + dataset._set_batch_size(batch_size) + dataset._set_use_var(self.feeds) pipe_command = 'python ctr_dataset_reader.py' - dataset.set_pipe_command(pipe_command) + dataset._set_pipe_command(pipe_command) dataset.set_filelist(filelist) - dataset.set_thread(thread_num) + dataset._set_thread(thread_num) for epoch_id in range(1): pass_start = time.time() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index 0b8df63d666b6547d5dccfc2ce0b420d653cc544..5582a65304d3e9bad2d4621e11f8a4f312189a9a 100644 --- 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -47,8 +47,8 @@ class SimpleNet(Layer): return z @declarative(input_spec=[[InputSpec([None, 10]), InputSpec([None, 10])]]) - def func_with_list(self, l): - x, y, int_val = l + def func_with_list(self, l, int_val=1): + x, y = l z = x + y z = z + int_val return z @@ -60,10 +60,7 @@ class SimpleNet(Layer): def func_with_dict(self, d): x = d['x'] y = d['y'] - int_val = d['int_val'] - z = x + y - z = z + int_val return z @@ -131,10 +128,10 @@ class TestInputSpec(unittest.TestCase): self.assertTrue(len(net.add_func.program_cache) == 1) # 5. test input with list - out = net.func_with_list([x, y, int_val]) + out = net.func_with_list([x, y], int_val) # 6. test input with dict - out = net.func_with_dict({'x': x, 'y': y, 'int_val': int_val}) + out = net.func_with_dict({'x': x, 'y': y}) # 7. test input with lits contains dict int_np = np.ones([1]).astype('float32') @@ -293,6 +290,30 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase): foo_3.concrete_program +class TestInputDefaultName(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.net = SimpleNet() + + def assert_default_name(self, func_name, input_names): + decorated_func = getattr(self.net, func_name) + + spec_names = [x.name for x in decorated_func.inputs] + self.assertListEqual(spec_names, input_names) + + def test_common_input(self): + self.assert_default_name('forward', ['x']) + + def test_list_input(self): + self.assert_default_name('func_with_list', ['l_0', 'l_1']) + + def test_dict_input(self): + self.assert_default_name('func_with_dict', ['x', 'y']) + + def test_nest_input(self): + self.assert_default_name('func_with_list_dict', ['dl_0', 'x', 'y']) + + class TestDeclarativeAPI(unittest.TestCase): def test_error(self): func = declarative(dyfunc_to_variable) @@ -311,5 +332,31 @@ class TestDeclarativeAPI(unittest.TestCase): func(np.ones(5).astype("int32")) +class TestDecorateModelDirectly(unittest.TestCase): + def setUp(self): + paddle.disable_static() + program_trans.enable(True) + self.x = to_variable(np.ones([4, 10]).astype('float32')) + + def test_fake_input(self): + net = SimpleNet() + net = declarative(net) + y = net(self.x) + self.assertTrue(len(net.forward.program_cache) == 1) + + def test_input_spec(self): + net = SimpleNet() + net = declarative(net, input_spec=[InputSpec([None, 8, 10])]) + self.assertTrue(len(net.forward.inputs) == 1) + self.assertTrue(len(net.forward.program_cache) == 1) + input_shape = net.forward.inputs[0].shape + self.assertListEqual(list(input_shape), [-1, 8, 10]) + + # redecorate + net = declarative(net, input_spec=[InputSpec([None, 16, 10])]) + input_shape = net.forward.inputs[0].shape + self.assertListEqual(list(input_shape), [-1, 16, 10]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py index 586020d434519b12c6fff4cbba812a013cf45c3d..2998ba85757e7677d5f9ab39ff81682a8b315072 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py @@ -14,15 +14,15 @@ from __future__ import print_function +import os import inspect import unittest - import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.core import EnforceNotMet -from 
paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA, ErrorData +from paddle.fluid.dygraph.dygraph_to_static import error from paddle.fluid.dygraph.dygraph_to_static.origin_info import unwrap -from paddle.fluid.dygraph.jit import declarative def inner_func(): @@ -30,7 +30,7 @@ def inner_func(): return -@declarative +@paddle.jit.to_static def func_error_in_compile_time(x): x = fluid.dygraph.to_variable(x) inner_func() @@ -41,14 +41,14 @@ def func_error_in_compile_time(x): return x_v -@declarative +@paddle.jit.to_static def func_error_in_compile_time_2(x): x = fluid.dygraph.to_variable(x) x = fluid.layers.reshape(x, shape=[1, 2]) return x -@declarative +@paddle.jit.to_static def func_error_in_runtime(x, iter_num=3): x = fluid.dygraph.to_variable(x) two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32") @@ -61,6 +61,9 @@ class TestErrorInCompileTime(unittest.TestCase): self.set_func() self.set_input() self.set_exception_type() + self.prog_trans = paddle.jit.ProgramTranslator() + self.simplify_error = 1 + self.disable_error = 0 def set_func(self): self.func = func_error_in_compile_time @@ -88,14 +91,38 @@ class TestErrorInCompileTime(unittest.TestCase): for m in self.expected_message: self.assertIn(m, error_message) - def test(self): - with fluid.dygraph.guard(): - with self.assertRaises(self.exception_type) as cm: - self.func(self.input) - exception = cm.exception - error_data = getattr(exception, ERROR_DATA) - self.assertIsInstance(error_data, ErrorData) - self._test_create_message(error_data) + def _test_attach_and_raise_new_exception(self, func_call): + paddle.disable_static() + with self.assertRaises(self.exception_type) as cm: + func_call() + exception = cm.exception + + error_data = getattr(exception, error.ERROR_DATA, None) + + self.assertIsInstance(error_data, error.ErrorData) + self._test_create_message(error_data) + + def test_static_layer_call(self): + # NOTE: self.func(self.input) is the StaticLayer().__call__(self.input) + call_dy2static = lambda: self.func(self.input) + + self.set_flags(0) + self._test_attach_and_raise_new_exception(call_dy2static) + + def test_program_translator_get_output(self): + call_dy2static = lambda : self.prog_trans.get_output(unwrap(self.func), self.input) + + self.set_flags(0) + self._test_attach_and_raise_new_exception(call_dy2static) + + def set_flags(self, disable_error=0, simplify_error=1): + os.environ[error.DISABLE_ERROR_ENV_NAME] = str(disable_error) + self.disable_error = int(os.getenv(error.DISABLE_ERROR_ENV_NAME, 0)) + self.assertEqual(self.disable_error, disable_error) + + os.environ[error.SIMPLIFY_ERROR_ENV_NAME] = str(simplify_error) + self.simplify_error = int(os.getenv(error.SIMPLIFY_ERROR_ENV_NAME, 1)) + self.assertEqual(self.simplify_error, simplify_error) class TestErrorInCompileTime2(TestErrorInCompileTime): @@ -143,5 +170,28 @@ class TestErrorInRuntime(TestErrorInCompileTime): self.assertIn(m, error_message) +@unwrap +@paddle.jit.to_static() +def func_decorated_by_other_1(): + return 1 + + +@paddle.jit.to_static() +@unwrap +def func_decorated_by_other_2(): + return 1 + + +class TestErrorInOther(unittest.TestCase): + def test(self): + paddle.disable_static() + prog_trans = paddle.jit.ProgramTranslator() + with self.assertRaises(NotImplementedError): + prog_trans.get_output(func_decorated_by_other_1) + + with self.assertRaises(NotImplementedError): + func_decorated_by_other_2() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py index b03777b6ebc7f3cceb73cd32e6fdfea11755320e..3f77e9ade285e2c3d8452ea2171505442ee52fb0 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py @@ -14,6 +14,7 @@ from __future__ import print_function +import sys import unittest from paddle.fluid.dygraph.dygraph_to_static.ast_transformer import DygraphToStaticAst @@ -177,8 +178,20 @@ class TestOriginInfoWithDecoratedFunc(TestOriginInfo): def set_dygraph_info(self): self.line_num = 2 - self.line_index_list = [0, 2] - self.dy_rel_lineno_list = [0, 2] + + # NOTE(liym27): + # There are differences in ast_node.lineno between PY3.8+ and PY3.8-. + # If the first gast.FunctionDef has decorator, the lineno of gast.FunctionDef is differs. + # 1. < PY3.8 + # its lineno equals to the lineno of the first decorator node, which is not right. + # 2. >= PY3.8 + # its lineno is the actual lineno, which is right. + if sys.version_info >= (3, 8): + self.line_index_list = [1, 2] + self.dy_rel_lineno_list = [1, 2] + else: + self.line_index_list = [0, 2] + self.dy_rel_lineno_list = [0, 2] self.dy_abs_col_offset = [0, 4] self.dy_func_name = [self.dygraph_func.__name__] * self.line_num @@ -199,8 +212,13 @@ class TestOriginInfoWithDecoratedFunc2(TestOriginInfo): def set_dygraph_info(self): self.line_num = 2 - self.line_index_list = [0, 3] - self.dy_rel_lineno_list = [0, 3] + + if sys.version_info >= (3, 8): + self.line_index_list = [2, 3] + self.dy_rel_lineno_list = [2, 3] + else: + self.line_index_list = [0, 3] + self.dy_rel_lineno_list = [0, 3] self.dy_abs_col_offset = [0, 4] self.dy_func_name = [self.dygraph_func.__name__] * self.line_num diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py index f0fbe54f9dbbf93121655e784601467c13b3a70d..91067f360995e1661c200df923a698f3f146b71e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py @@ -14,6 +14,7 @@ from __future__ import print_function import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.layers.utils import flatten from paddle.fluid.dygraph import declarative, ProgramTranslator @@ -151,5 +152,33 @@ class TestWithTrainAndEval(unittest.TestCase): partial_layer._train_program) +class GPT2LMHeadModel(fluid.dygraph.Layer): + def __init__(self): + super(GPT2LMHeadModel, self).__init__() + self.embedding0 = paddle.nn.Embedding(20, 16) + self.embedding1 = paddle.nn.Embedding(20, 32) + self.lm_head_weight = paddle.to_tensor( + np.random.rand(2, 3).astype('float32')) + + @declarative + def forward(self, x): + x = fluid.layers.reshape(x, shape=[-1, 6]) + x1, x2, x3 = fluid.layers.split(input=x, dim=1, num_or_sections=3) + return x1 + + +class TestPruneUnusedParamInProgram(unittest.TestCase): + def test_prune(self): + input_ids = np.array([[15, 11, 6, 3, 18, 13]]).astype("float32") + + place = fluid.CPUPlace() + with fluid.dygraph.guard(place): + model = GPT2LMHeadModel() + model.eval() + input_ids = paddle.to_tensor(input_ids) + out = model(input_ids) + self.assertTrue(np.array_equal(out.numpy(), [[15, 11]])) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py index dfcd1758db2b22b211f84be528739aa71132ab8a..34a52e7aed342ac8db471ad94b277efd0faf9d27 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py @@ -17,6 +17,7 @@ import numpy as np from inference_pass_test import InferencePassTest import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker class TransposeFlattenConcatFusePassTest(InferencePassTest): @@ -45,6 +46,37 @@ class TransposeFlattenConcatFusePassTest(InferencePassTest): use_gpu = True self.check_output_with_option(use_gpu) + PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass') + + +class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32") + data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32") + trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0]) + trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0]) + flatt1 = fluid.layers.flatten(trans1, axis=2) + flatt2 = fluid.layers.flatten(trans2, axis=2) + concat_out = fluid.layers.concat([flatt1, flatt2], axis=1) + # There is no parameters for above structure. + # Hence, append a batch_norm to avoid failure caused by load_combined. + out = fluid.layers.batch_norm(concat_out, is_test=True) + + self.feeds = { + "data1": np.random.random([5, 5, 5]).astype("float32"), + "data2": np.random.random([5, 5, 5]).astype("float32") + } + self.fetch_list = [out] + + def test_check_output(self): + # There is no cpu pass for transpose_flatten_concat_fuse + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + + PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass') + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py new file mode 100644 index 0000000000000000000000000000000000000000..060f6c6c5f0446661e886390637714ad7dfc300d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py @@ -0,0 +1,53 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig + + +class PadOpTRTTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[1, 3, 128, 128], dtype="float32") + pad_out = fluid.layers.pad(x=data, + paddings=[0, 0, 0, 0, 0, 1, 1, 2], + pad_value=0.0) + out = fluid.layers.batch_norm(pad_out, is_test=True) + + self.feeds = { + "data": np.random.random((1, 3, 128, 128)).astype("float32") + } + self.enable_trt = True + self.trt_parameters = PadOpTRTTest.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..660a9c93e66715f41e4a972ff571c0c00f31316f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py @@ -0,0 +1,150 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig + + +#normal starts && ends +class SlicePluginTRTTest1(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") + axes = [1, 3] + starts = [0, 1] + ends = [2, 3] + slice_out = fluid.layers.slice( + data, axes=axes, starts=starts, ends=ends) + out = fluid.layers.batch_norm(slice_out, is_test=True) + + self.feeds = { + "data": np.random.random((3, 3, 3, 3)).astype("float32"), + } + # Diff occurred between GPU and TRT. + # In order to provide TRT CI ASAP, this test for trt part + # is disabled temporarily. 
+ self.enable_trt = True + self.trt_parameters = SlicePluginTRTTest1.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +#negative starts && ends +class SlicePluginTRTTest2(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") + axes = [2, 3] + starts = [-3, -2] + ends = [-1, 3] + slice_out = fluid.layers.slice( + data, axes=axes, starts=starts, ends=ends) + out = fluid.layers.batch_norm(slice_out, is_test=True) + + self.feeds = { + "data": np.random.random((3, 3, 3, 3)).astype("float32"), + } + # Diff occurred between GPU and TRT. + # In order to provide TRT CI ASAP, this test for trt part + # is disabled temporarily. + self.enable_trt = True + self.trt_parameters = SlicePluginTRTTest2.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +#exceeded bound starts && ends +class SlicePluginTRTTest3(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") + axes = [2, 3] + starts = [-5, -2] + ends = [-1, 8] + slice_out = fluid.layers.slice( + data, axes=axes, starts=starts, ends=ends) + out = fluid.layers.batch_norm(slice_out, is_test=True) + + self.feeds = { + "data": np.random.random((3, 3, 3, 3)).astype("float32"), + } + # Diff occurred between GPU and TRT. + # In order to provide TRT CI ASAP, this test for trt part + # is disabled temporarily. + self.enable_trt = True + self.trt_parameters = SlicePluginTRTTest3.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +#fp16 +class SlicePluginTRTTest4(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") + axes = [2, 3] + starts = [-5, -2] + ends = [-1, 8] + slice_out = fluid.layers.slice( + data, axes=axes, starts=starts, ends=ends) + out = fluid.layers.batch_norm(slice_out, is_test=True) + + self.feeds = { + "data": np.random.random((3, 3, 3, 3)).astype("float32"), + } + # Diff occurred between GPU and TRT. + # In order to provide TRT CI ASAP, this test for trt part + # is disabled temporarily. 
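An aside on the slice cases above: starts and ends behave like numpy slicing, so negative values count from the end of the axis and out-of-range values are clamped. For the exceeded-bound case (axes=[2, 3], starts=[-5, -2], ends=[-1, 8] on a [3, 3, 3, 3] input), a sketch of the reference region, assuming numpy-equivalent semantics:

    import numpy as np

    x = np.random.random((3, 3, 3, 3)).astype("float32")
    # axis 2: start -5 clamps to 0, end -1 resolves to 2 -> 0:2
    # axis 3: start -2 resolves to 1, end 8 clamps to 3  -> 1:3
    ref = x[:, :, 0:2, 1:3]
    assert ref.shape == (3, 3, 2, 2)
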
+ self.enable_trt = True + self.trt_parameters = SlicePluginTRTTest3.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ff4531f0e250e325f39ef69161c8d1ee751a2336 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py @@ -0,0 +1,145 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_fusion_gru_op import fusion_gru +from paddle.fluid.tests.unittests.test_fusion_lstm_op import fc, ACTIVATION + + +class TestFusionGRUINT8MKLDNNOp(OpTest): + def set_confs(self): + pass + + def setUp(self): + self.op_type = "fusion_gru" + self.lod = [[2, 4, 3]] + self.IC = 3 + self.OC = 5 + self.is_reverse = False + self.with_h0 = False + self.with_bias = True + self.act_state = 'tanh' + self.act_gate = 'sigmoid' + self.origin_mode = True + self.use_mkldnn = True + self.force_fp32_output = True + self.error_margin = 1e-5 + self.set_confs() + + # RNN dimensions + T = sum(self.lod[0]) + N = len(self.lod[0]) + + # Input data + x_f32 = np.random.rand(T, self.IC).astype('float32') * 2 - 1 + scale_data = 63 + shift_data = 64 + x_u8 = (x_f32 * scale_data + shift_data).astype(np.uint8) + + # WeightX/WeightH data + wx = np.random.rand(self.IC, 3 * self.OC).astype('float32') * 2 - 1 + wh = np.random.rand(self.OC, 3 * self.OC).astype('float32') * 2 - 1 + + # Calculating weight scales + # scales = 63 / max(abs(channel_wise(weightsX + weightsH))) + # WeightX data shape in PP: [IC, 3 * OC] + # WeightH data shape in PP: [OC, 2 * OC] + [OC, OC] + # Scales shape in oneDNN: [3, OC] + scale_ur = 63 / np.max(np.abs( + np.concatenate( + [ + wx[:, :2 * self.OC], wh.flatten()[:2 * self.OC * self.OC] + .reshape(self.OC, 2 * self.OC) + ], + axis=0)), + axis=0) + scale_o = 63 / np.max(np.abs( + np.concatenate( + [ + wx[:, 2 * self.OC:], wh.flatten()[2 * self.OC * self.OC:] + .reshape(self.OC, self.OC) + ], + axis=0)), + axis=0) + + scale_weights = np.concatenate([scale_ur, scale_o]).astype('float') + + bias = np.random.rand( + 1, 3 * self.OC).astype('float32') if self.with_bias else np.zeros( + (1, 3 * self.OC), dtype='float32') + h0 = np.random.rand( + N, self.OC).astype('float32') if self.with_h0 else np.zeros( + (N, self.OC), dtype='float32') + + _, _, _, hidden_f32 = fusion_gru(x_f32, self.lod, h0, wx, wh, bias, + self.is_reverse, self.origin_mode, + ACTIVATION[self.act_state], + 
ACTIVATION[self.act_gate]) + + self.inputs = {'X': (x_u8, self.lod), 'WeightX': wx, 'WeightH': wh} + + if self.with_bias: + self.inputs['Bias'] = bias + + if self.with_h0: + self.inputs['H0'] = h0 + + if self.force_fp32_output: + self.error_margin = 1e-1 + self.outputs = {'Hidden': (hidden_f32, self.lod)} + else: + self.error_margin = 1 + hidden_u8 = (hidden_f32 * scale_data + shift_data).astype(np.uint8) + self.outputs = {'Hidden': (hidden_u8, self.lod)} + + self.attrs = { + 'activation': self.act_state, + 'gate_activation': self.act_gate, + 'is_reverse': self.is_reverse, + 'origin_mode': self.origin_mode, + 'use_mkldnn': self.use_mkldnn, + 'force_fp32_output': self.force_fp32_output, + 'Scale_data': scale_data, + 'Shift_data': shift_data, + 'Scale_weights': scale_weights + } + + def test_check_output(self): + self.check_output(check_dygraph=False, atol=self.error_margin) + + +class TestFusionGRUINT8MKLDNNOp2(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.force_fp32_output = False + + +class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.origin_mode = False + + +class TestFusionGRUINT8MKLDNNOp4(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.with_bias = False + + +class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.with_h0 = False + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index ab61a5b3cfccb0e885debe9786ae91a9754e9345..f6ba03194aa909279aa2cd884fc575041b01a4cd 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -128,7 +128,7 @@ class TestLogSigmoid(TestActivation): class TestLogSigmoidAPI(unittest.TestCase): - # test paddle.nn.LogSigmoid, paddle.nn.functional.logsigmoid + # test paddle.nn.LogSigmoid, paddle.nn.functional.log_sigmoid def setUp(self): self.x_np = np.random.uniform(-1, 1, [11, 17]).astype('float32') self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ @@ -137,36 +137,45 @@ class TestLogSigmoidAPI(unittest.TestCase): def test_static_api(self): with paddle.static.program_guard(paddle.static.Program()): x = paddle.data('X', [11, 17]) - out1 = F.logsigmoid(x) + out1 = F.log_sigmoid(x) m = paddle.nn.LogSigmoid() out2 = m(x) exe = paddle.static.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) out_ref = np.log(1 / (1 + np.exp(-self.x_np))) for r in res: - self.assertEqual(np.allclose(out_ref, r), True) + self.assertTrue(np.allclose(out_ref, r)) def test_dygraph_api(self): paddle.disable_static(self.place) x = paddle.to_tensor(self.x_np) - out1 = F.logsigmoid(x) + out1 = F.log_sigmoid(x) m = paddle.nn.LogSigmoid() out2 = m(x) out_ref = np.log(1 / (1 + np.exp(-self.x_np))) for r in [out1, out2]: - self.assertEqual(np.allclose(out_ref, r.numpy()), True) + self.assertTrue(np.allclose(out_ref, r.numpy())) paddle.enable_static() + def test_fluid_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', [11, 17]) + out = paddle.fluid.layers.logsigmoid(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = np.log(1 / (1 + np.exp(-self.x_np))) + self.assertTrue(np.allclose(out_ref, res[0])) + def test_errors(self): with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. 
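The log-sigmoid reference used throughout the tests above, np.log(1 / (1 + np.exp(-x))), is exact over the tested range, but np.exp(-x) can overflow for large negative x; a mathematically identical, numerically stabler form (a side note, not part of the patch):

    import numpy as np

    x = np.random.uniform(-1, 1, [11, 17]).astype('float32')
    out_ref = np.log(1 / (1 + np.exp(-x)))
    out_stable = -np.logaddexp(0.0, -x)  # log_sigmoid(x) = -log(1 + exp(-x))
    assert np.allclose(out_ref, out_stable)
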
- self.assertRaises(TypeError, F.logsigmoid, 1) + self.assertRaises(TypeError, F.log_sigmoid, 1) # The input dtype must be float16, float32, float64. x_int32 = paddle.data(name='x_int32', shape=[11, 17], dtype='int32') - self.assertRaises(TypeError, F.logsigmoid, x_int32) + self.assertRaises(TypeError, F.log_sigmoid, x_int32) # support the input dtype is float16 x_fp16 = paddle.data(name='x_fp16', shape=[11, 17], dtype='float16') - F.logsigmoid(x_fp16) + F.log_sigmoid(x_fp16) class TestTanh(TestActivation, TestParameter): diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 14e83fccd655527d8f3012365e4757d23236a445..47bf8f49e39b6451ee480d461e83324b89cacee2 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -448,7 +448,6 @@ class TestAdamOpV2(unittest.TestCase): def test_adam_op_with_state_dict(self): - import paddle paddle.disable_static() emb = paddle.nn.Embedding(10, 10) @@ -517,6 +516,20 @@ class TestAdamOpV2(unittest.TestCase): adam = paddle.optimizer.Adam( 0.1, epsilon=-1, parameters=linear.parameters()) + def test_adam_op_with_sparse_input_and_weight_decay(self): + + paddle.disable_static() + x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64) + x = paddle.to_tensor(x_data, stop_gradient=False) + emb = paddle.nn.Embedding(10, 10, sparse=True) + adam = paddle.optimizer.Adam( + 0.001, parameters=emb.parameters(), weight_decay=0.01) + + with self.assertRaises(RuntimeError): + out = emb(x) + out.backward() + adam.step() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py index 5a135cea52903a0d896df2d446b58d99e5a18993..424406c15bb18bade54a9b11bfdd96862d4df85c 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py @@ -94,6 +94,10 @@ class TestPool1d_API(unittest.TestCase): result = ada_max_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + result = paddle.nn.functional.common.interpolate( + input, mode="area", size=16) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_adaptive_avg_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py index 55c30e3d2ade0725e6debcdd0a69ca4eee622aec..e3c70884ebcf116feb4f5b0aa808c71e4b7f8c4e 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py @@ -163,6 +163,9 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase): out_5 = paddle.nn.functional.adaptive_avg_pool2d( x=x, output_size=[None, 3]) + out_6 = paddle.nn.functional.interpolate( + x=x, mode="area", size=[2, 5]) + assert np.allclose(out_1.numpy(), self.res_1_np) assert np.allclose(out_2.numpy(), self.res_2_np) @@ -173,6 +176,8 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase): assert np.allclose(out_5.numpy(), self.res_5_np) + assert np.allclose(out_6.numpy(), self.res_3_np) + class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py 
b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py index c04ee660667edaff01d7029e83b912c05429a15f..a3c9dd91a69ea83b08c3f817403620460333b5e9 100755 --- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py @@ -178,6 +178,9 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase): out_5 = paddle.nn.functional.adaptive_avg_pool3d( x=x, output_size=[None, 3, None]) + out_6 = paddle.nn.functional.interpolate( + x=x, mode="area", size=[2, 3, 5]) + assert np.allclose(out_1.numpy(), self.res_1_np) assert np.allclose(out_2.numpy(), self.res_2_np) @@ -188,6 +191,8 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase): assert np.allclose(out_5.numpy(), self.res_5_np) + assert np.allclose(out_6.numpy(), self.res_3_np) + class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py b/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py index 70863d3857c43c84a583f0ccf7b9bd733fdb4fd0..fbacaa3d5ce10bdad6dd87fdfc04c1173aff18ff 100644 --- a/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py @@ -18,9 +18,9 @@ from op_test import OpTest, skip_check_grad_ci import paddle.fluid as fluid -class TestAmpCheckFiniteAndScaleOp(OpTest): +class TestCheckFiniteAndUnscaleOp(OpTest): def setUp(self): - self.op_type = "amp_check_finite_and_scale" + self.op_type = "check_finite_and_unscale" self.init_dtype() x = np.random.random((1024, 1024)).astype(self.dtype) scale = np.random.random((1)).astype(self.dtype) @@ -28,7 +28,7 @@ class TestAmpCheckFiniteAndScaleOp(OpTest): self.inputs = {'X': [('x0', x)], 'Scale': scale} self.outputs = { 'FoundInfinite': np.array([0]), - 'Out': [('out0', x * scale)], + 'Out': [('out0', x / scale)], } def init_dtype(self): @@ -38,9 +38,9 @@ class TestAmpCheckFiniteAndScaleOp(OpTest): self.check_output() -class TestAmpCheckFiniteAndScaleOpWithNan(OpTest): +class TestCheckFiniteAndUnscaleOpWithNan(OpTest): def setUp(self): - self.op_type = "amp_check_finite_and_scale" + self.op_type = "check_finite_and_unscale" self.init_dtype() x = np.random.random((1024, 1024)).astype(self.dtype) x[128][128] = np.nan @@ -61,9 +61,9 @@ class TestAmpCheckFiniteAndScaleOpWithNan(OpTest): self.check_output(no_check_set=['Out']) -class TestAmpCheckFiniteAndScaleOpWithInf(OpTest): +class TestCheckFiniteAndUnscaleOpWithInf(OpTest): def setUp(self): - self.op_type = "amp_check_finite_and_scale" + self.op_type = "check_finite_and_unscale" self.init_dtype() x = np.random.random((1024, 1024)).astype(self.dtype) x[128][128] = np.inf diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py index 1b1b1d7c983282974d2fa46038c35c98de4f9ec2..74f76030a29d2c9ce27278b61548c8877c1467ad 100644 --- a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py @@ -325,16 +325,16 @@ class TestArgMinMaxOpError(unittest.TestCase): def test_argmax_dtype_type(): data = paddle.static.data( name="test_argmax", shape=[10], dtype="float32") - output = paddle.argmax(x=data, dtype=1) + output = paddle.argmax(x=data, dtype=None) - self.assertRaises(TypeError, test_argmax_dtype_type) + self.assertRaises(ValueError, test_argmax_dtype_type) def test_argmin_dtype_type(): data = 
paddle.static.data( name="test_argmin", shape=[10], dtype="float32") - output = paddle.argmin(x=data, dtype=1) + output = paddle.argmin(x=data, dtype=None) - self.assertRaises(TypeError, test_argmin_dtype_type) + self.assertRaises(ValueError, test_argmin_dtype_type) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py index 01daea32167d28edbb46d6854872976aed79494e..b1ec74411987a73cf2e6a7d60aecce6c87ed598e 100644 --- a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py @@ -53,6 +53,8 @@ def value_bound(input, w, h, x, y): def bicubic_interp_np(input, out_h, out_w, + scale_h=0, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -73,13 +75,19 @@ def bicubic_interp_np(input, if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if out_w > 1: if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_h, out_w)) @@ -128,7 +136,8 @@ class TestBicubicInterpOp(OpTest): self.init_test_case() self.op_type = "bicubic_interp_v2" input_np = np.random.random(self.input_shape).astype("float64") - + scale_h = 0 + scale_w = 0 if self.data_layout == "NCHW": in_h = self.input_shape[2] in_w = self.input_shape[3] @@ -151,9 +160,9 @@ class TestBicubicInterpOp(OpTest): out_h = self.out_h out_w = self.out_w - output_np = bicubic_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners, - self.data_layout) + output_np = bicubic_interp_np(input_np, out_h, out_w, scale_h, scale_w, + self.out_size, self.actual_shape, + self.align_corners, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -480,10 +489,34 @@ class TestBicubicOpError(unittest.TestCase): out = interpolate( x, size=None, - mode='trilinear', + mode='bicubic', align_corners=False, scale_factor=[1, 2, 2]) + def test_size_and_scale(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, + size=None, + mode='bicubic', + align_corners=False, + scale_factor=None) + + def test_size_and_scale2(): + x = fluid.data( + name="input", shape=[2, 3, 6, 9, 4], dtype="float32") + out = interpolate( + x, + size=[2, 2, 2], + mode='trilinear', + align_corners=False, + scale_factor=2.0) + + def test_size_type(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, size={2, 2}, mode='bicubic', align_corners=False) + self.assertRaises(ValueError, test_mode_type) self.assertRaises(ValueError, test_input_shape) self.assertRaises(TypeError, test_align_corcers) @@ -498,6 +531,9 @@ class TestBicubicOpError(unittest.TestCase): self.assertRaises(ValueError, test_align_corners_and_nearest) self.assertRaises(ValueError, test_scale_shape) self.assertRaises(ValueError, test_scale_value) + self.assertRaises(ValueError, test_size_and_scale) + self.assertRaises(ValueError, test_size_and_scale2) + self.assertRaises(TypeError, test_size_type) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py index 
d139a53c7e2ccc68964457f3142b4ed890d339f2..9fc4971fec23923a40080613612d3a1843a86d2e 100755 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py @@ -26,6 +26,8 @@ import paddle def bilinear_interp_np(input, out_h, out_w, + scale_w=0, + scale_h=0, out_size=None, actual_shape=None, align_corners=True, @@ -47,12 +49,18 @@ def bilinear_interp_np(input, if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if out_w > 1: if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_h, out_w)) @@ -110,7 +118,8 @@ class TestBilinearInterpOp(OpTest): else: in_h = self.input_shape[1] in_w = self.input_shape[2] - + scale_h = 0 + scale_w = 0 if self.scale: if isinstance(self.scale, float) or isinstance(self.scale, int): if self.scale > 0.: @@ -126,9 +135,9 @@ class TestBilinearInterpOp(OpTest): out_h = self.out_h out_w = self.out_w - output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners, - self.align_mode, self.data_layout) + output_np = bilinear_interp_np( + input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape, + self.align_corners, self.align_mode, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -238,6 +247,17 @@ class TestBilinearInterpCase6(TestBilinearInterpOp): self.align_mode = 1 +class TestBilinearInterpCase7(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = [2.0, 0.5] + self.align_corners = False + self.align_mode = 1 + + class TestBilinearInterpSame(TestBilinearInterpOp): def init_test_case(self): self.interp_method = 'bilinear' @@ -298,9 +318,9 @@ class TestBilinearInterpOpUint8(OpTest): out_h = self.out_h out_w = self.out_w - output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners, - self.align_mode) + output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0, + self.out_size, self.actual_shape, + self.align_corners, self.align_mode) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -481,8 +501,9 @@ class TestBilinearInterpOp_attr_tensor(OpTest): if isinstance(self.scale, list) and len(self.scale) == 1: self.scale = [self.scale[0], self.scale[0]] self.attrs['scale'] = self.scale - output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners) + output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0, + self.out_size, self.actual_shape, + self.align_corners) self.outputs = {'Out': output_np} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_error.py b/python/paddle/fluid/tests/unittests/test_broadcast_error.py new file mode 100644 index 0000000000000000000000000000000000000000..517de67fd6dddf1d0a74df6ffed659720862b20c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_broadcast_error.py @@ -0,0 +1,38 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core + + +class TestBroadcastOpCpu(OpTest): + def setUp(self): + self.op_type = "broadcast" + input = np.random.random((100, 2)).astype("float32") + np_out = input[:] + self.inputs = {"X": input} + self.attrs = {"sync_mode": False, "root": 0} + self.outputs = {"Out": np_out} + + def test_check_output_cpu(self): + try: + self.check_output_with_place(place=core.CPUPlace()) + except: + print("do not support cpu test, skip") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py index 3e8c449d8995ca90401861e93f2fb987d1c6967d..fdfaf6a3113bbb9a50a79de7ef4ac4c3251d5759 100644 --- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest - +import numpy as np import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid.backward import calc_gradient @@ -81,5 +81,22 @@ class TestDoubleGrad(unittest.TestCase): self.assertEqual(12, out[0]) +class TestGradientWithPrune(unittest.TestCase): + def test_prune(self): + x = fluid.data(name='x', shape=[3], dtype='float32') + x.stop_gradient = False + x1, x2, x3 = fluid.layers.split(x, dim=0, num_or_sections=3) + y = x1 * 2 + x1_grad = fluid.gradients(y, x) + + exe = fluid.Executor(fluid.CPUPlace()) + main = fluid.default_main_program() + exe.run(fluid.default_startup_program()) + out = exe.run(main, + feed={'x': np.ones([3]).astype('float32')}, + fetch_list=[x1_grad]) + self.assertTrue(np.array_equal(out[0], [2., 0., 0.])) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py index 034bb7f8dc7e00a321b6c6a5a4776fa4f7398ab5..a2f56c428012c615dcf55b6832a54ca473771d08 100644 --- a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py @@ -33,10 +33,14 @@ class TestCollectFPNProposalstOp(OpTest): for i in range(self.num_level)] self.inputs = { 'MultiLevelRois': inputs_x, - "MultiLevelScores": self.scores_input + "MultiLevelScores": self.scores_input, + 'MultiLevelRoIsNum': [] } self.attrs = {'post_nms_topN': self.post_nms_top_n, } - self.outputs = {'FpnRois': (self.rois, [self.lod])} + self.outputs = { + 'FpnRois': (self.rois, [self.lod]), + 'RoisNum': np.array(self.lod).astype('int32') + } def init_test_case(self): self.post_nms_top_n = 20 @@ -96,5 +100,32 @@ class TestCollectFPNProposalstOp(OpTest): self.check_output(check_dygraph=False) +class TestCollectFPNProposalstOpWithRoisNum(TestCollectFPNProposalstOp): + def set_data(self): + self.init_test_case() + self.make_rois() + self.scores_input = [('y%d' % i, + 
(self.scores[i].reshape(-1, 1), self.rois_lod[i])) + for i in range(self.num_level)] + self.rois, self.lod = self.calc_rois_collect() + inputs_x = [('x%d' % i, (self.roi_inputs[i][:, 1:], self.rois_lod[i])) + for i in range(self.num_level)] + rois_num_per_level = [ + ('rois%d' % i, np.array(self.rois_lod[i][0]).astype('int32')) + for i in range(self.num_level) + ] + + self.inputs = { + 'MultiLevelRois': inputs_x, + "MultiLevelScores": self.scores_input, + 'MultiLevelRoIsNum': rois_num_per_level + } + self.attrs = {'post_nms_topN': self.post_nms_top_n, } + self.outputs = { + 'FpnRois': (self.rois, [self.lod]), + 'RoisNum': np.array(self.lod).astype('int32') + } + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index cfad50409802d4f3d35c9da3b22597c681da91b1..25ae65aa7c968b2e6f1f9429d1a4e4e618fe7033 100644 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -38,6 +38,7 @@ def create_test_class(op_type, typename, callback): self.check_output() def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): x = fluid.layers.data(name='x', shape=[2], dtype='int32') y = fluid.layers.data(name='y', shape=[2], dtype='int32') @@ -80,6 +81,7 @@ def create_paddle_case(op_type, callback): self.place = paddle.CUDAPlace(0) def test_api(self): + paddle.enable_static() with program_guard(Program(), Program()): x = fluid.data(name='x', shape=[4], dtype='int64') y = fluid.data(name='y', shape=[4], dtype='int64') @@ -92,6 +94,7 @@ def create_paddle_case(op_type, callback): self.assertEqual((res == self.real_result).all(), True) def test_broadcast_api_1(self): + paddle.enable_static() with program_guard(Program(), Program()): x = paddle.static.data( name='x', shape=[1, 2, 1, 3], dtype='int32') @@ -108,6 +111,7 @@ def create_paddle_case(op_type, callback): self.assertEqual((res == real_result).all(), True) def test_attr_name(self): + paddle.enable_static() with program_guard(Program(), Program()): x = fluid.layers.data(name='x', shape=[4], dtype='int32') y = fluid.layers.data(name='y', shape=[4], dtype='int32') @@ -130,6 +134,7 @@ create_paddle_case('not_equal', lambda _a, _b: _a != _b) class TestCompareOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): # The input x and y of compare_op must be Variable. 
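test_broadcast_api_1 above compares an int32 tensor of shape [1, 2, 1, 3] with one of shape [1, 2, 3]; the expected result follows ordinary numpy broadcasting for elementwise comparison. A quick check of that shape expectation:

    import numpy as np

    x = np.arange(6).reshape((1, 2, 1, 3)).astype(np.int32)
    y = np.arange(6).reshape((1, 2, 3)).astype(np.int32)
    res = x == y  # y broadcasts to (1, 1, 2, 3); result shape is (1, 2, 2, 3)
    assert res.shape == (1, 2, 2, 3)
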
x = fluid.layers.data(name='x', shape=[1], dtype="float32") @@ -140,6 +145,7 @@ class TestCompareOpError(unittest.TestCase): class API_TestElementwise_Equal(unittest.TestCase): def test_api(self): + paddle.enable_static() with fluid.program_guard(fluid.Program(), fluid.Program()): label = fluid.layers.assign(np.array([3, 3], dtype="int32")) limit = fluid.layers.assign(np.array([3, 2], dtype="int32")) @@ -159,5 +165,31 @@ class API_TestElementwise_Equal(unittest.TestCase): self.assertEqual((res == np.array([True, True])).all(), True) +class TestCompareOpPlace(unittest.TestCase): + def test_place_1(self): + paddle.enable_static() + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + label = fluid.layers.assign(np.array([3, 3], dtype="int32")) + limit = fluid.layers.assign(np.array([3, 2], dtype="int32")) + out = fluid.layers.less_than(label, limit, force_cpu=True) + exe = fluid.Executor(place) + res, = exe.run(fetch_list=[out]) + self.assertEqual((res == np.array([False, False])).all(), True) + + def test_place_2(self): + place = paddle.CPUPlace() + data_place = place + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + data_place = paddle.CUDAPinnedPlace() + paddle.disable_static(place) + data = np.array([9], dtype="int64") + data_tensor = paddle.to_tensor(data, place=data_place) + result = data_tensor == 0 + self.assertEqual((result.numpy() == np.array([False])).all(), True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index 582bb3dcc681921cdbf2111dcd26b299f06a3058..208956b825ed1d78aeacf85fc052210e42d247ce 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -38,26 +38,22 @@ class TestDataset(unittest.TestCase): def test_dataset_create(self): """ Testcase for dataset create. 
""" try: - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() except: self.assertTrue(False) try: - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") + dataset = paddle.distributed.QueueDataset() except: self.assertTrue(False) try: - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "FileInstantDataset") + dataset = paddle.distributed.fleet.dataset.FileInstantDataset() except: self.assertTrue(False) try: - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "MyOwnDataset") + dataset = paddle.distributed.fleet.dataset.MyOwnDataset() self.assertTrue(False) except: self.assertTrue(True) @@ -95,18 +91,18 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) + dataset.update_settings(pipe_command="cat1") + dataset._init_distributed_settings( + parse_ins_id=True, + parse_content=True, + fea_eval=True, + candidate_size=10000) dataset.set_filelist( ["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"]) - dataset.set_parse_ins_id(True) - dataset.set_parse_content(True) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() - dataset.set_fea_eval(10000, True) dataset.local_shuffle() exe = fluid.Executor(fluid.CPUPlace()) @@ -176,14 +172,14 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, + thread_num=3, + pipe_command="cat", + download_cmd="cat", + use_var=slots_vars) dataset.set_filelist([filename1, filename2]) - dataset.set_pipe_command("cat") - dataset.set_download_cmd("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) @@ -228,22 +224,19 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) + dataset._init_distributed_settings(fea_eval=True, candidate_size=1) dataset.set_filelist([ "test_in_memory_dataset_run_a.txt", "test_in_memory_dataset_run_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() - dataset.set_fea_eval(1, True) dataset.slots_shuffle(["slot1"]) dataset.local_shuffle() - dataset.set_generate_unique_feasigns(True, 15) - dataset.generate_local_tables_unlock(0, 11, 1, 25, 15) + dataset._set_generate_unique_feasigns(True, 15) + dataset._generate_local_tables_unlock(0, 11, 1, 25, 15) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) if self.use_data_loader: @@ -300,17 +293,14 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="float32", 
lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(1) - dataset.set_parse_ins_id(True) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars) + dataset._init_distributed_settings(parse_ins_id=True) dataset.set_filelist([ "test_in_memory_dataset_masterpatch_a.txt", "test_in_memory_dataset_masterpatch_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() dataset.local_shuffle() @@ -325,7 +315,8 @@ class TestDataset(unittest.TestCase): except Exception as e: self.assertTrue(False) - dataset.set_merge_by_lineid(2) + #dataset._set_merge_by_lineid(2) + dataset.update_settings(merge_size=2) dataset.dataset.merge_by_lineid() os.remove("./test_in_memory_dataset_masterpatch_a.txt") @@ -367,17 +358,14 @@ class TestDataset(unittest.TestCase): name="slot4", shape=[1], dtype="float32", lod_level=0) slots_vars = [var1, var2, var3, var4] - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(1) - dataset.set_parse_ins_id(True) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars) + dataset._init_distributed_settings(parse_ins_id=True) dataset.set_filelist([ "test_in_memory_dataset_masterpatch1_a.txt", "test_in_memory_dataset_masterpatch1_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() dataset.local_shuffle() @@ -392,7 +380,7 @@ class TestDataset(unittest.TestCase): except Exception as e: self.assertTrue(False) - dataset.set_merge_by_lineid(2) + dataset._set_merge_by_lineid(2) dataset.dataset.merge_by_lineid() os.remove("./test_in_memory_dataset_masterpatch1_a.txt") @@ -423,16 +411,13 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="float32", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) dataset.set_filelist([ "test_in_memory_dataset_run_a.txt", "test_in_memory_dataset_run_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() dataset.local_shuffle() @@ -473,9 +458,9 @@ class TestDataset(unittest.TestCase): except Exception as e: self.assertTrue(False) - dataset.set_merge_by_lineid(2) - dataset.set_parse_ins_id(False) - dataset.set_fleet_send_sleep_seconds(2) + dataset._set_merge_by_lineid(2) + dataset._set_parse_ins_id(False) + dataset._set_fleet_send_sleep_seconds(2) dataset.preload_into_memory() dataset.wait_preload_done() dataset.release_memory() @@ -483,10 +468,25 @@ class TestDataset(unittest.TestCase): dataset.wait_preload_done() dataset.dataset.merge_by_lineid() dataset.release_memory() - dataset.set_merge_by_lineid(30) - dataset.set_parse_ins_id(False) + dataset._set_merge_by_lineid(30) + dataset._set_parse_ins_id(False) dataset.load_into_memory() dataset.dataset.merge_by_lineid() + dataset.update_settings( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=[], + fs_name="", + fs_ugi="", + download_cmd="cat", + merge_size=-1, + parse_ins_id=False, + 
parse_content=False, + fleet_send_batch_size=2, + fleet_send_sleep_seconds=2, + fea_eval=True) fleet_ptr = fluid.core.Fleet() fleet_ptr.set_client2client_config(1, 1, 1) fleet_ptr.get_cache_threshold(0) @@ -517,14 +517,11 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.QueueDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) dataset.set_filelist( ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) @@ -543,12 +540,9 @@ class TestDataset(unittest.TestCase): except Exception as e: self.assertTrue(False) - dataset2 = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") - dataset2.set_use_var(slots_vars) - dataset2.set_batch_size(32) - dataset2.set_thread(3) - dataset2.set_pipe_command("cat") + dataset2 = paddle.distributed.QueueDataset() + dataset2.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) dataset.set_filelist([]) try: exe.train_from_dataset(fluid.default_main_program(), dataset2) @@ -585,14 +579,11 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="float32", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.QueueDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) dataset.set_filelist( ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0)) @@ -641,15 +632,15 @@ class TestDataset(unittest.TestCase): name=slot, shape=[None, 1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_input_type(1) - dataset.set_batch_size(1) - dataset.set_thread(2) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=slots_vars) dataset.set_filelist( ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda( @@ -721,13 +712,10 @@ class TestDatasetWithFetchHandler(unittest.TestCase): inputs(list): inputs of get_dataset files(list): files of get_dataset """ - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.QueueDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=inputs) dataset.set_filelist(files) - dataset.set_pipe_command("cat") - dataset.set_use_var(inputs) return dataset def setUp(self): @@ -879,16 +867,17 @@ class TestDataset2(unittest.TestCase): except ImportError as e: print("warning: no mpi4py") exe.run(startup_program) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") 
- dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + + dataset.init( + batch_size=32, + thread_num=3, + pipe_command="cat", + use_var=slots_vars) dataset.set_filelist([ "test_in_memory_dataset2_run_a.txt", "test_in_memory_dataset2_run_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() fleet._opt_info = None fleet._fleet_ptr = None @@ -949,16 +938,16 @@ class TestDataset2(unittest.TestCase): except ImportError as e: print("warning: no mpi4py") exe.run(startup_program) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, + thread_num=3, + pipe_command="cat", + use_var=slots_vars) dataset.set_filelist([ "test_in_memory_dataset2_run2_a.txt", "test_in_memory_dataset2_run2_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() try: dataset.global_shuffle(fleet) @@ -966,14 +955,11 @@ class TestDataset2(unittest.TestCase): print("warning: catch expected error") fleet._opt_info = None fleet._fleet_ptr = None - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_rank_offset("") - dataset.set_pv_batch_size(1) - dataset.set_hdfs_config("", "") + dataset = paddle.distributed.InMemoryDataset() + dataset.init(fs_name="", fs_ugi="") d = paddle.distributed.fleet.DatasetBase() try: - dataset.set_feed_type("MultiSlotInMemoryDataFeed") + dataset._set_feed_type("MultiSlotInMemoryDataFeed") except: print("warning: catch expected error") dataset.thread_num = 0 @@ -981,9 +967,6 @@ class TestDataset2(unittest.TestCase): dataset._prepare_to_run() except: print("warning: catch expected error") - dataset.set_parse_logkey(True) - dataset.set_merge_by_sid(True) - dataset.set_enable_pv_merge(True) try: dataset.preprocess_instance() except: @@ -996,16 +979,15 @@ class TestDataset2(unittest.TestCase): dataset.postprocess_instance() except: print("warning: catch expected error") - dataset.set_fleet_send_batch_size(1024) + dataset._set_fleet_send_batch_size(1024) try: dataset.global_shuffle() except: print("warning: catch expected error") - dataset.get_pv_data_size() + #dataset.get_pv_data_size() dataset.get_memory_data_size() dataset.get_shuffle_data_size() - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") + dataset = paddle.distributed.QueueDataset() try: dataset.local_shuffle() except: @@ -1027,6 +1009,120 @@ class TestDataset2(unittest.TestCase): os.remove("./test_in_memory_dataset2_run2_a.txt") os.remove("./test_in_memory_dataset2_run2_b.txt") + def test_bosps_dataset_fleet2(self): + """ + Testcase for InMemoryDataset from create to run. 
+ """ + with open("test_in_memory_dataset2_run2_a.txt", "w") as f: + data = "1 1 2 3 3 4 5 5 5 5 1 1\n" + data += "1 2 2 3 4 4 6 6 6 6 1 2\n" + data += "1 3 2 3 5 4 7 7 7 7 1 3\n" + f.write(data) + with open("test_in_memory_dataset2_run2_b.txt", "w") as f: + data = "1 4 2 3 3 4 5 5 5 5 1 4\n" + data += "1 5 2 3 4 4 6 6 6 6 1 5\n" + data += "1 6 2 3 5 4 7 7 7 7 1 6\n" + data += "1 7 2 3 6 4 8 8 8 8 1 7\n" + f.write(data) + + train_program = fluid.Program() + startup_program = fluid.Program() + scope = fluid.Scope() + from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet + with fluid.program_guard(train_program, startup_program): + slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"] + slots_vars = [] + for slot in slots: + var = fluid.layers.data(\ + name=slot, shape=[1], dtype="float32", lod_level=1) + slots_vars.append(var) + fake_cost = \ + fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1]) + fake_cost = fluid.layers.mean(fake_cost) + with fluid.scope_guard(scope): + place = fluid.CPUPlace() + exe = fluid.Executor(place) + try: + fleet.init() + except ImportError as e: + print("warning: no mpi4py") + adam = fluid.optimizer.Adam(learning_rate=0.000005) + try: + adam = fleet.distributed_optimizer( + adam, + strategy={ + "fs_uri": "fs_uri_xxx", + "fs_user": "fs_user_xxx", + "fs_passwd": "fs_passwd_xxx", + "fs_hadoop_bin": "fs_hadoop_bin_xxx" + }) + adam.minimize([fake_cost], [scope]) + except AttributeError as e: + print("warning: no mpi") + except ImportError as e: + print("warning: no mpi4py") + exe.run(startup_program) + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset.init( + batch_size=32, + thread_num=3, + pipe_command="cat", + use_var=slots_vars) + dataset.set_filelist([ + "test_in_memory_dataset2_run2_a.txt", + "test_in_memory_dataset2_run2_b.txt" + ]) + dataset.load_into_memory() + try: + dataset.global_shuffle(fleet) + except: + print("warning: catch expected error") + fleet._opt_info = None + fleet._fleet_ptr = None + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset.init( + rank_offset="", + pv_batch_size=1, + fs_name="", + fs_ugi="", + data_feed_type="MultiSlotInMemoryDataFeed", + parse_logkey=True, + merge_by_sid=True, + enable_pv_merge=True) + d = paddle.distributed.fleet.DatasetBase() + try: + dataset._set_feed_type("MultiSlotInMemoryDataFeed") + except: + print("warning: catch expected error") + dataset.thread_num = 0 + try: + dataset._prepare_to_run() + except: + print("warning: catch expected error") + dataset._set_parse_logkey(True) + dataset._set_merge_by_sid(True) + dataset._set_enable_pv_merge(True) + try: + dataset.preprocess_instance() + except: + print("warning: catch expected error") + try: + dataset.set_current_phase(1) + except: + print("warning: catch expected error") + try: + dataset.postprocess_instance() + except: + print("warning: catch expected error") + dataset._set_fleet_send_batch_size(1024) + try: + dataset.global_shuffle() + except: + print("warning: catch expected error") + #dataset.get_pv_data_size() + dataset.get_memory_data_size() + dataset.get_shuffle_data_size() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py index c13c33f209f0f7d0fff95bdfb5b4e551a145b87e..9195ac277b93ade31b50682a4c3553c3664093f3 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py @@ -97,9 +97,11 @@ 
class DatasetLoaderTestBase(unittest.TestCase): def check_batch_number(self, place, randomize_batch_num=False): main_prog, startup_prog, feeds = self.build_network() - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - self.dataset_name) - dataset.set_batch_size(BATCH_SIZE) + if self.dataset_name == "QueueDataset": + dataset = paddle.distributed.QueueDataset() + else: + dataset = paddle.distributed.InMemoryDataset() + dataset._set_batch_size(BATCH_SIZE) if isinstance(place, fluid.CPUPlace): file_num = 10 @@ -128,8 +130,8 @@ class DatasetLoaderTestBase(unittest.TestCase): fake_reader(batch_num=BATCH_NUM + random_delta_batch_size[i])) dataset.set_filelist(filelist) - dataset.set_use_var(feeds) - dataset.set_pipe_command("cat") + dataset._set_use_var(feeds) + dataset._set_pipe_command("cat") if self.dataset_name == 'InMemoryDataset': dataset.load_into_memory() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..5a5d8afc55bac4c0ea862e75b728c6c1a37b3188 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
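The dataset tests above consistently migrate from DatasetFactory().create_dataset(...) plus per-field setters to constructing the dataset class directly and configuring it through init()/update_settings(), with distributed-only knobs behind _init_distributed_settings() and a few remaining fields set via private _set_* helpers. A condensed sketch of the new pattern (slots_vars and the file names are placeholders for whatever the surrounding test defines):

    import paddle

    dataset = paddle.distributed.InMemoryDataset()
    dataset.init(
        batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars)
    dataset.set_filelist(["train_part_a.txt", "train_part_b.txt"])
    dataset.load_into_memory()
    dataset.local_shuffle()
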
+ +import unittest +import paddle +import os +import paddle.distributed.fleet.base.role_maker as role_maker +import time + + +class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_PSERVER_NUMS"] = "2" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_a_sync_optimizer1(self): + os.environ["TRAINING_ROLE"] = "TRAINER" + import paddle.distributed.fleet as fleet + + main_program = paddle.fluid.Program() + startup_program = paddle.fluid.Program() + + paddle.fluid.framework.switch_main_program(main_program) + paddle.fluid.framework.switch_startup_program(startup_program) + + fleet.init(role_maker.PaddleCloudRoleMaker()) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.auto = True + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + self.assertTrue(optimizer.user_defined_strategy.a_sync) + a_sync_configs = optimizer.user_defined_strategy.a_sync_configs + self.assertTrue(a_sync_configs['k_steps'] == 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py new file mode 100644 index 0000000000000000000000000000000000000000..9085556c04c356e5b703ec0b36c3884100ad73f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os +import paddle.distributed.fleet.base.role_maker as role_maker +import time + + +class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_PSERVER_NUMS"] = "2" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_a_sync_optimizer3(self): + os.environ["TRAINING_ROLE"] = "TRAINER" + import paddle.distributed.fleet as fleet + + main_program = paddle.fluid.Program() + startup_program = paddle.fluid.Program() + + paddle.fluid.framework.switch_main_program(main_program) + paddle.fluid.framework.switch_startup_program(startup_program) + + fleet.init(role_maker.PaddleCloudRoleMaker()) + input_x = paddle.fluid.layers.data( + name="x", + shape=[-1, 1], + dtype="int64", + lod_level=1, + append_batch_size=False) + x_embedding = paddle.fluid.layers.embedding( + is_distributed=False, + input=input_x, + size=[1000000000, 100000], + param_attr=paddle.fluid.ParamAttr( + name="embedding", + initializer=paddle.fluid.initializer.Constant(value=0.01)), + is_sparse=True) + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.auto = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + self.assertTrue(optimizer.user_defined_strategy.a_sync) + a_sync_configs = optimizer.user_defined_strategy.a_sync_configs + self.assertTrue(a_sync_configs['k_steps'] == 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py new file mode 100644 index 0000000000000000000000000000000000000000..4787d048bd2566fe063073867bcbd4138d25ff21 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py @@ -0,0 +1,67 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
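The three new a_sync optimizer tests share one skeleton: build a small fc network, enable DistributedStrategy().auto, and inspect the a_sync configuration that fleet derives (k_steps == 0 for the Adam and sparse-embedding async cases above, 800 for the geo case below). Reduced to the strategy-related lines, with avg_cost and the inner optimizer taken from the surrounding test:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.auto = True  # let fleet derive the a_sync settings
    optimizer = fleet.distributed_optimizer(inner_optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)
    k_steps = optimizer.user_defined_strategy.a_sync_configs['k_steps']
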
+
+import unittest
+import paddle
+import os
+import paddle.distributed.fleet.base.role_maker as role_maker
+
+
+class TestFleetASyncOptimizerAutoGeo(unittest.TestCase):
+    def setUp(self):
+        os.environ["PADDLE_PSERVER_NUMS"] = "2"
+        os.environ["PADDLE_TRAINERS_NUM"] = "2"
+        os.environ["POD_IP"] = "127.0.0.1"
+        os.environ["PADDLE_PORT"] = "36001"
+        os.environ["PADDLE_TRAINER_ID"] = "0"
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
+            "127.0.0.1:36001,127.0.0.2:36001"
+
+    def test_a_sync_optimizer2(self):
+        os.environ["TRAINING_ROLE"] = "TRAINER"
+        import paddle.distributed.fleet as fleet
+
+        main_program = paddle.fluid.Program()
+        startup_program = paddle.fluid.Program()
+
+        paddle.fluid.framework.switch_main_program(main_program)
+        paddle.fluid.framework.switch_startup_program(startup_program)
+
+        fleet.init(role_maker.PaddleCloudRoleMaker())
+        input_x = paddle.fluid.layers.data(
+            name="x", shape=[32], dtype='float32')
+        input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
+
+        fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
+        fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
+        prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
+        cost = paddle.fluid.layers.cross_entropy(
+            input=prediction, label=input_y)
+        avg_cost = paddle.fluid.layers.mean(x=cost)
+
+        strategy = paddle.distributed.fleet.DistributedStrategy()
+        strategy.auto = True
+        optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
+        optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
+        optimizer.minimize(avg_cost)
+
+        self.assertTrue(optimizer.user_defined_strategy.a_sync)
+        a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
+        self.assertTrue(a_sync_configs['k_steps'] == 800)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
index beb0069eb770f25d7834749ff9c188e5252e13c0..3a923dbf3f72e28c64c3f01d22d4d6f2d897f79b 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -76,9 +76,10 @@ class FleetDistRunnerBase(object):
         return role
 
     def build_strategy(self, args):
-        self.strategy = paddle.distributed.fleet.DistributedStrategy()
-        self.strategy.a_sync = False
-        if args.mode == "async":
+        if args.mode == "sync":
+            self.strategy = paddle.distributed.fleet.DistributedStrategy()
+            self.strategy.a_sync = False
+        elif args.mode == "async":
             self.strategy = paddle.distributed.fleet.DistributedStrategy()
             self.strategy.a_sync = True
         elif args.mode == "geo":
@@ -87,6 +88,10 @@ class FleetDistRunnerBase(object):
             self.strategy.a_sync_configs = {
                 "k_steps": args.geo_sgd_need_push_nums
             }
+        elif args.mode == "auto":
+            self.strategy = paddle.distributed.fleet.DistributedStrategy()
+            self.strategy.auto = True
+
         self.dump_param = os.getenv("dump_param", "").split(",")
         self.dump_fields = os.getenv("dump_fields", "").split(",")
         self.dump_fields_path = os.getenv("dump_fields_path", "")
@@ -232,14 +237,17 @@ class TestFleetBase(unittest.TestCase):
         tr0_pipe = open(tempfile.gettempdir() + "/tr0_err.log", "wb+")
         tr1_pipe = open(tempfile.gettempdir() + "/tr1_err.log", "wb+")
 
+        tr0_out = open(tempfile.gettempdir() + "/tr0_stdout.log", "wb+")
+        tr1_out = open(tempfile.gettempdir() + "/tr1_stdout.log", "wb+")
+
         tr0_proc = subprocess.Popen(
             tr0_cmd.strip().split(" "),
-
stdout=subprocess.PIPE, + stdout=tr0_out, stderr=tr0_pipe, env=required_envs) tr1_proc = subprocess.Popen( tr1_cmd.strip().split(" "), - stdout=subprocess.PIPE, + stdout=tr1_out, stderr=tr1_pipe, env=required_envs) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py index e2336caac1c07f555280b82ba8fcfa7e5ec7f5b8..02ca0588e7452d44817f6c288ea9cf77b80dbfe8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py @@ -52,6 +52,38 @@ class TestDistMnistSync2x2(TestFleetBase): "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) +class TestDistMnistAuto2x2(TestFleetBase): + def _setup_config(self): + self._mode = "auto" + self._reader = "pyreader" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast + "http_proxy": "", + "CPU_NUM": "2" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) + + def test_dist_train(self): + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) + + class TestDistMnistAsync2x2(TestFleetBase): def _setup_config(self): self._mode = "async" diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py index 1062123948481a4164a12a4bed818b964923006f..761d57408b9a8f9e52419331bfb0bca5b0135c30 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py @@ -113,8 +113,8 @@ class TranspilerAsyncLRDecayTest(unittest.TestCase): ["listen_and_serv"]) # block1: sum,cast,scale,floor,fill_constant,elementwise_pow,scale self.assertEqual([op.type for op in pserver.blocks[1].ops], [ - "sum", "cast", "fill_constant", "elementwise_div", "floor", - "fill_constant", "elementwise_pow", "scale" + "sum", "cast", "scale", "floor", "fill_constant", "elementwise_pow", + "scale" ]) # block1~2: optimize pass diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py index 55b21f1a722f822f1bfcb7bbbda645109092b8a3..ec0125b28ed1b870025adbfd2bd4ba78244bcc11 100644 --- a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py @@ -35,9 +35,10 @@ class TestDistributeFPNProposalsOp(OpTest): } output = [('out%d' % i, self.rois_fpn[i]) for i in range(len(self.rois_fpn))] + self.outputs = { 'MultiFpnRois': output, - 'RestoreIndex': self.rois_idx_restore.reshape(-1, 1) + 'RestoreIndex': self.rois_idx_restore.reshape(-1, 1), } def init_test_case(self): @@ -117,5 +118,34 @@ class TestDistributeFPNProposalsOp(OpTest): self.check_output() +class TestDistributeFPNProposalsOpWithRoisNum(TestDistributeFPNProposalsOp): + def set_data(self): + self.init_test_case() + self.make_rois() + self.rois_fpn, self.rois_idx_restore = self.calc_rois_distribute() + self.inputs = { + 'FpnRois': (self.rois[:, 1:5], self.rois_lod), + 'RoisNum': 
np.array(self.rois_lod[0]).astype('int32') + } + self.attrs = { + 'max_level': self.roi_max_level, + 'min_level': self.roi_min_level, + 'refer_scale': self.canonical_scale, + 'refer_level': self.canonical_level + } + output = [('out%d' % i, self.rois_fpn[i]) + for i in range(len(self.rois_fpn))] + rois_num_per_level = [ + ('rois_num%d' % i, np.array(self.rois_fpn[i][1][0]).astype('int32')) + for i in range(len(self.rois_fpn)) + ] + + self.outputs = { + 'MultiFpnRois': output, + 'RestoreIndex': self.rois_idx_restore.reshape(-1, 1), + 'MultiLevelRoIsNum': rois_num_per_level + } + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py index 466226c53fabbd315acd19c6421f210d0ca225c1..a963c2ece0958048b5f0c850184a0930022e6671 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py @@ -121,6 +121,9 @@ class TestDygraphWeightNorm(unittest.TestCase): before_weight = linear.weight.numpy() if self.dim == None: self.dim = -1 + + if self.dim != -1: + self.dim = (self.dim + len(before_weight)) % len(before_weight) wn = weight_norm(linear, dim=self.dim) outputs = [] for name, data in self.data.items(): @@ -158,6 +161,13 @@ class TestDygraphWeightNormCase3(TestDygraphWeightNorm): self.dim = 3 +class TestDygraphWeightNormCase4(TestDygraphWeightNorm): + def init_test_case(self): + self.batch_size = 3 + self.data_desc = (['x', [2, 3, 3]], ) + self.dim = -3 + + class TestDygraphRemoveWeightNorm(unittest.TestCase): def setUp(self): self.init_test_case() diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index b4359fc69ae18b45774af0d2b20c1540bd99da5c..698f914f89984d8c09619a46c6a6b292b00aac9a 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -25,6 +25,7 @@ import bisect import numpy as np fluid.default_startup_program().random_seed = 1 +np.random.seed(1) class TestDyRnnStaticInput(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index 9ebaf8ff9438be8c8a57815be0798b861d05caaf..3cfbac8b613c125956861f73b1bab24c34e05572 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -240,124 +240,25 @@ class TestElementwiseDivBroadcast(unittest.TestCase): self.assertEqual((out_result == (2 / x)).all(), True) -class TestDivideAPI(unittest.TestCase): - def setUp(self): - paddle.set_default_dtype("float64") - self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(fluid.CUDAPlace(0)) - - def check_static_result(self, place): - # rule 1 - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = np.array([1, 2, 3]) - self.assertRaises(TypeError, paddle.divide, x=x, y=y) - - # rule 2: both the inputs are not Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = 2 - y = 4 - res = paddle.divide(x, y) - exe = fluid.Executor(place) - np_z = exe.run(fluid.default_main_program(), - feed={}, - fetch_list=[res]) - self.assertEqual(np_z[0] == 0.5, True) - - # rule 3: - with fluid.program_guard(fluid.Program(), 
fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = x / y - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 3, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 1.5, 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 5: y is Tensor, x is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = y / x - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 8, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 0.25, 0.5]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 6: y is Tensor, x is Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float64") - exe = fluid.Executor(place) - res = x / y - np_z = exe.run(fluid.default_main_program(), - feed={ - "x": np.array([2, 3, 4]).astype('float64'), - "y": np.array([1, 5, 2]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([2., 0.6, 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) +class TestDivideOp(unittest.TestCase): + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="float32") + y = fluid.data(name='y', shape=[2, 3], dtype='float32') - def test_static(self): - for place in self.places: - self.check_static_result(place=place) + y_1 = paddle.divide(x, y, name='div_res') + self.assertEqual(('div_res' in y_1.name), True) def test_dygraph(self): - for place in self.places: - with fluid.dygraph.guard(place): - # rule 1 : avoid numpy.ndarray - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.divide, x=x, y=np_y) - - # rule 2: both the inputs are not Tensor - z = paddle.divide(3, 2) - self.assertEqual(z.numpy()[0] == 1.5, True) - - # rule 3: both the inputs are Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x, dtype="float32") - y = paddle.to_tensor(np_y, dtype="float64") - self.assertRaises(TypeError, paddle.divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = x / y - z_expected = np.array([1., 1.5, 2.]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 5: y is Tensor, x is scalar - np_x = np.array([2, 1, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = y / x - z_expected = np.array([1., 2., 0.5]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x / y - z_expected = np.array([2., 0.6, 2.]) - self.assertEqual((z_expected == z.numpy()).all(), True) + with fluid.dygraph.guard(): + np_x = np.array([2, 3, 4]).astype('float64') + np_y = np.array([1, 5, 2]).astype('float64') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.divide(x, y) + np_z = z.numpy() + 
z_expected = np.array([2., 0.6, 2.]) + self.assertEqual((np_z == z_expected).all(), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py index 0b6acc7615395ed99a484e0e56f9c62447a1f345..f339081e31b87b8d5584fd4f866e0aaf6f391ea7 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py @@ -58,13 +58,6 @@ class TestElementwiseModOp(OpTest): pass -class TestElementwiseModOpInverse(TestElementwiseModOp): - def init_input_output(self): - self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype) - self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype) - self.out = np.floor_divide(self.x, self.y) - - class TestElementwiseModOp_scalar(TestElementwiseModOp): def init_input_output(self): scale_x = random.randint(0, 100000000) @@ -74,124 +67,25 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp): self.out = np.floor_divide(self.x, self.y) -class TestFloorDivideAPI(unittest.TestCase): - def setUp(self): - paddle.set_default_dtype("float64") - self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(fluid.CUDAPlace(0)) - - def check_static_result(self, place): - # rule 1 - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = np.array([1, 2, 3]) - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) - - # rule 2: both the inputs are not Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = 2 - y = 4 - res = paddle.floor_divide(x, y) - exe = fluid.Executor(place) - np_z = exe.run(fluid.default_main_program(), - feed={}, - fetch_list=[res]) - self.assertEqual(np_z[0] == 0., True) - - # rule 3: - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = x // y - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 3, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 1., 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 5: y is Tensor, x is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = y // x - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 8, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 0., 0.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 6: y is Tensor, x is Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float64") - exe = fluid.Executor(place) - res = x // y - np_z = exe.run(fluid.default_main_program(), - feed={ - "x": np.array([2, 3, 4]).astype('float64'), - "y": np.array([1, 5, 2]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([2., 0., 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - def test_static(self): - for place in self.places: - 
self.check_static_result(place=place) +class TestFloorDivideOp(unittest.TestCase): + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="int64") + y = fluid.data(name='y', shape=[2, 3], dtype='int64') + + y_1 = paddle.floor_divide(x, y, name='div_res') + self.assertEqual(('div_res' in y_1.name), True) def test_dygraph(self): - for place in self.places: - with fluid.dygraph.guard(place): - # rule 1 : avoid numpy.ndarray - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=np_y) - - # rule 2: both the inputs are not Tensor - z = paddle.floor_divide(3, 2) - self.assertEqual(z.numpy()[0] == 1., True) - - # rule 3: both the inputs are Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x, dtype="float32") - y = paddle.to_tensor(np_y, dtype="float64") - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = x // y - z_expected = np.array([1, 1, 2]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 5: y is Tensor, x is scalar - np_x = np.array([2, 1, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = y // x - z_expected = np.array([1, 2, 0]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x // y - z_expected = np.array([2., 0., 2.]) - self.assertEqual((z_expected == z.numpy()).all(), True) + with fluid.dygraph.guard(): + np_x = np.array([2, 3, 8, 7]).astype('int64') + np_y = np.array([1, 5, 3, 3]).astype('int64') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.floor_divide(x, y) + np_z = z.numpy() + z_expected = np.array([2, 0, 2, 2]) + self.assertEqual((np_z == z_expected).all(), True) with fluid.dygraph.guard(fluid.CPUPlace()): # divide by zero diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py index cab6160d761004877896deea8d44ca02c9de2e1e..2a8ca51693ecfad55f2239d7619e355c6dd7f3f8 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py @@ -84,149 +84,41 @@ class TestElementwiseModOpDouble(TestElementwiseModOpFloat): self.dtype = np.float64 -class TestRemainderAPI(unittest.TestCase): - def setUp(self): - paddle.set_default_dtype("float64") - self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(fluid.CUDAPlace(0)) - - def check_static_result(self, place): - # rule 1 - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = np.array([1, 2, 3]) - self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 3: - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = x % y - np_z = 
exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 3, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([0., 1., 0.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 5: y is Tensor, x is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = 3 - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 6: y is Tensor, x is Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[1], dtype="float64") - exe = fluid.Executor(place) - res = x % y - np_z = exe.run(fluid.default_main_program(), - feed={ - "x": np.array([1., 2., 4]).astype('float64'), - "y": np.array([1.5]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([1., 0.5, 1.0]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 6: y is Tensor, x is Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[6], dtype="float64") - y = fluid.data(name="y", shape=[1], dtype="float64") - exe = fluid.Executor(place) - res = x % y - np_z = exe.run( - fluid.default_main_program(), - feed={ - "x": np.array([-3., -2, -1, 1, 2, 3]).astype('float64'), - "y": np.array([2]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([1., 0., 1., 1., 0., 1.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - def test_static(self): - for place in self.places: - self.check_static_result(place=place) +class TestRemainderOp(unittest.TestCase): + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="int64") + y = fluid.data(name='y', shape=[2, 3], dtype='int64') + + y_1 = paddle.remainder(x, y, name='div_res') + self.assertEqual(('div_res' in y_1.name), True) def test_dygraph(self): - for place in self.places: - with fluid.dygraph.guard(place): - # rule 1 : avoid numpy.ndarray - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.remainder, x=x, y=np_y) - - # rule 3: both the inputs are Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x, dtype="float32") - y = paddle.to_tensor(np_y, dtype="float64") - self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = x % y - z_expected = np.array([0, 1, 0]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 5: y is Tensor, x is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.remainder, x=3, y=x) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([1., 2., 4]) - np_y = np.array([1.5]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x % y - z_expected = np.array([1., 0.5, 1.0]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([-3., -2, -1, 1, 2, 3]) - np_y = np.array([2.]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x % y - z_expected = np.array([1., 0., 1., 1., 0., 1.]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - np_x = np.array([-3.3, 11.5, -2, 3.5]) - np_y = np.array([-1.2, 2., 3.3, -2.3]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x % y - z_expected = np.array([-0.9, 1.5, 1.3, 
-1.1]) - self.assertEqual(np.allclose(z_expected, z.numpy()), True) - - np_x = np.array([-3, 11, -2, 3]) - np_y = np.array([-1, 2, 3, -2]) - x = paddle.to_tensor(np_x, dtype="int64") - y = paddle.to_tensor(np_y, dtype="int64") - z = x % y - z_expected = np.array([0, 1, 1, -1]) - self.assertEqual(np.allclose(z_expected, z.numpy()), True) - - np_x = np.array([-3, 3]) - np_y = np.array([[2, 3], [-2, -1]]) - x = paddle.to_tensor(np_x, dtype="int64") - y = paddle.to_tensor(np_y, dtype="int64") - z = x % y - z_expected = np.array([[1, 0], [-1, 0]]) - self.assertEqual(np.allclose(z_expected, z.numpy()), True) + with fluid.dygraph.guard(): + np_x = np.array([2, 3, 8, 7]).astype('int64') + np_y = np.array([1, 5, 3, 3]).astype('int64') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.remainder(x, y) + np_z = z.numpy() + z_expected = np.array([0, 3, 2, 1]) + self.assertEqual((np_z == z_expected).all(), True) + + np_x = np.array([-3.3, 11.5, -2, 3.5]) + np_y = np.array([-1.2, 2., 3.3, -2.3]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = x % y + z_expected = np.array([-0.9, 1.5, 1.3, -1.1]) + self.assertEqual(np.allclose(z_expected, z.numpy()), True) + + np_x = np.array([-3, 11, -2, 3]) + np_y = np.array([-1, 2, 3, -2]) + x = paddle.to_tensor(np_x, dtype="int64") + y = paddle.to_tensor(np_y, dtype="int64") + z = x % y + z_expected = np.array([0, 1, 1, -1]) + self.assertEqual(np.allclose(z_expected, z.numpy()), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_empty_op.py b/python/paddle/fluid/tests/unittests/test_empty_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e8b1f836fcaa8d53671307d9075efd45fc88ce7b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_empty_op.py @@ -0,0 +1,270 @@ +#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
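+# Note: `empty` returns uninitialized memory, so the tests below only make
+# weak assertions: a numeric output is either all zeros or holds at least
+# two distinct values (see verify_output in each test class).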
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from op_test import OpTest +from paddle.fluid import Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + + +# Situation 1: Attr(shape) is a list(without tensor) +class TestEmptyOp(OpTest): + def setUp(self): + self.op_type = "empty" + self.init_config() + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + data_type = outs[0].dtype + if data_type in ['float32', 'float64', 'int32', 'int64']: + max_value = np.nanmax(outs[0]) + min_value = np.nanmin(outs[0]) + + always_full_zero = max_value == 0.0 and min_value == 0.0 + always_non_full_zero = max_value > min_value + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = outs[0].size + true_num = np.sum(outs[0] == True) + false_num = np.sum(outs[0] == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + self.assertTrue(False, 'invalid data type') + + def init_config(self): + shape = [500, 3] + dtype = 'float32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp2(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'float64' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp3(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'int32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp4(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'int64' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp5(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'bool' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +# Situation 2: shape is a tensor +class TestEmptyOp_ShapeTensor(OpTest): + def setUp(self): + self.op_type = "empty" + self.init_config() + + def init_config(self): + self.shape = [500, 3] + dtype = 'float32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'dtype': dtype_inner} + self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")} + self.outputs = {'Out': np.zeros(self.shape).astype(dtype)} + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + data_type = outs[0].dtype + if data_type in ['float32', 'float64', 'int32', 'int64']: + max_value = np.nanmax(outs[0]) + min_value = np.nanmin(outs[0]) + + always_full_zero = max_value == 0.0 and min_value == 0.0 + always_non_full_zero = max_value > min_value + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = outs[0].size + true_num = 
np.sum(outs[0] == True) + false_num = np.sum(outs[0] == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + self.assertTrue(False, 'invalid data type') + + +# Situation 3: Attr(shape) is a list(with tensor) +class TestEmptyOp_ShapeTensorList(OpTest): + def setUp(self): + self.op_type = "empty" + self.init_config() + + def init_config(self): + self.shape = [123, 92] + self.infer_shape = [-1, 92] + + dtype = 'float32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {"ShapeTensorList": shape_tensor_list} + self.attrs = {'shape': self.infer_shape, 'dtype': dtype_inner} + self.outputs = {'Out': np.zeros(self.shape).astype(dtype)} + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + data_type = outs[0].dtype + if data_type in ['float32', 'float64', 'int32', 'int64']: + max_value = np.nanmax(outs[0]) + min_value = np.nanmin(outs[0]) + + always_full_zero = max_value == 0.0 and min_value == 0.0 + always_non_full_zero = max_value > min_value + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = outs[0].size + true_num = np.sum(outs[0] == True) + false_num = np.sum(outs[0] == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + self.assertTrue(False, 'invalid data type') + + +class TestEmptyAPI(unittest.TestCase): + def __check_out__(self, out, dtype='float32'): + max_value = np.nanmax(np.array(out)) + min_value = np.nanmin(np.array(out)) + always_non_full_zero = max_value > min_value + always_full_zero = max_value == 0.0 and min_value == 0.0 + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + + def test_dygraph_api_out(self): + paddle.disable_static() + shape = [200, 3] + out = paddle.empty(shape=shape) + self.__check_out__(out) + paddle.enable_static() + + def test_dygraph_api_out_2(self): + paddle.disable_static() + shape_data = np.array([200, 3]).astype('int32') + shape = paddle.to_tensor(shape_data) + out = paddle.empty(shape=shape) + self.__check_out__(out) + paddle.enable_static() + + def test_dygraph_api_out_3(self): + paddle.disable_static() + shape_data = np.array([200, 3]).astype('int64') + shape = paddle.to_tensor(shape_data) + out = paddle.empty(shape=shape) + self.__check_out__(out) + paddle.enable_static() + + def test_dygraph_api_attr(self): + paddle.disable_static() + shape = [200, 3] + dtype = 'float64' + out = paddle.empty(shape=shape, dtype=dtype) + self.__check_out__(out, dtype) + paddle.enable_static() + + def test_static_graph(self): + dtype = 'float64' + + positive_2_int32 = fluid.layers.fill_constant([1], "int32", 3) + positive_2_int64 = fluid.layers.fill_constant([1], "int64", 3) + + shape_tensor_int32 = fluid.data( + name="shape_tensor_int32", shape=[2], dtype="int32") + shape_tensor_int64 = fluid.data( + name="shape_tensor_int64", shape=[2], dtype="int64") + + out_1 = paddle.empty(shape=[200, 3], dtype=dtype) + out_2 = paddle.empty(shape=shape_tensor_int32, dtype=dtype) + out_3 = paddle.empty(shape=shape_tensor_int64, dtype=dtype) + out_4 = paddle.empty(shape=[200, positive_2_int32], dtype=dtype) + out_5 = paddle.empty(shape=[200, positive_2_int64], 
dtype=dtype) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + res_1, res_2, res_3, res_4, res_5 = exe.run( + fluid.default_main_program(), + feed={ + "shape_tensor_int32": np.array([200, 3]).astype("int32"), + "shape_tensor_int64": np.array([200, 3]).astype("int64"), + }, + fetch_list=[out_1, out_2, out_3, out_4, out_5]) + + self.__check_out__(res_1, dtype) + self.__check_out__(res_2, dtype) + self.__check_out__(res_3, dtype) + self.__check_out__(res_4, dtype) + self.__check_out__(res_5, dtype) + + +class TestEmptyError(unittest.TestCase): + def test_attr(self): + def test_dtype(): + shape = [200, 3] + dtype = 'uint8' + result = paddle.empty(shape=shape, dtype=dtype) + + self.assertRaises(TypeError, test_dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py index 1181272bd98b00f65e6925b44da814662f96045f..37d269e3369bfe7db00529dea5e08b287151691a 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py @@ -28,7 +28,7 @@ class TestFetchUnmerged(unittest.TestCase): conv_pool_1 = fluid.nets.simple_img_conv_pool( input=img, filter_size=5, - num_filters=20, + num_filters=8, pool_size=2, pool_stride=2, pool_type='max', @@ -37,12 +37,12 @@ class TestFetchUnmerged(unittest.TestCase): conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, - num_filters=50, + num_filters=16, pool_size=2, pool_stride=2, pool_type='avg', act="relu") - hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu') + hidden = fluid.layers.fc(input=conv_pool_2, size=32, act='relu') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) avg_loss = fluid.layers.mean(loss) @@ -75,8 +75,8 @@ class TestFetchUnmerged(unittest.TestCase): binary = fluid.CompiledProgram(main_program).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy) - iters = 3 - batch_size = 64 + iters = 2 + batch_size = 16 train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=500), diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py index 38c3903306e6e76188cdb50476d6797814c434e9..73e014b35008ff5a0539c6a338755b9dc2cf68d4 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -57,7 +57,7 @@ class TestFleetAMPOptimizer(unittest.TestCase): ops = [op.type for op in avg_cost.block.ops] self.assertIn('cast', ops) - self.assertIn('isfinite', ops) + self.assertIn('check_finite_and_unscale', ops) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_auto.py b/python/paddle/fluid/tests/unittests/test_fleet_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..020f2f4db382ef1277167d85917e8fdba9c83893 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_auto.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import os +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker + + +class TestDistributedStrategyAuto(unittest.TestCase): + def setUp(self): + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_distributed_strategy_auto(self): + fleet.init(is_collective=True) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.auto = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 8d715674cc6c9ba4f8b5c1ff4fe0cbdbe7841643..6f8af3017efcb9010b129131a01c5ee071b5bc36 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -81,9 +81,10 @@ class TestStrategyConfig(unittest.TestCase): def test_localsgd_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() - configs = {"k_steps": 4} + configs = {"k_steps": 4, "begin_step": 120} strategy.localsgd_configs = configs self.assertEqual(strategy.localsgd_configs["k_steps"], 4) + self.assertEqual(strategy.localsgd_configs["begin_step"], 120) def test_dgc(self): strategy = paddle.distributed.fleet.DistributedStrategy() @@ -230,7 +231,7 @@ class TestStrategyConfig(unittest.TestCase): strategy.a_sync = True strategy.localsgd = True strategy.dgc = True - localsgd_configs = {"k_steps": 5} + localsgd_configs = {"k_steps": 5, "begin_step": 1} strategy.localsgd_configs = localsgd_configs build_strategy = paddle.fluid.BuildStrategy() build_strategy.enable_sequential_execution = True @@ -316,6 +317,14 @@ class TestStrategyConfig(unittest.TestCase): self.assertEqual(strategy.conv_workspace_size_limit, 1000) strategy._enable_env() + def test_distributed_strategy_repr(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.recompute = True + strategy.recompute_configs = {"checkpoints": ["a1", "a2", "a3"]} + strategy.amp = True + strategy.localsgd = True + print(str(strategy)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py 
b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py index 3f140f53b043b1949572f3728ca8a0c556317783..ec055178d90c529080489218f3aca1a71311beea 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py @@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker class TestFleetLambMetaOptimizer(unittest.TestCase): def setUp(self): - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - os.environ["PADDLE_TRAINERS_NUM"] = "2" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" def net(self, main_prog, startup_prog): with fluid.program_guard(main_prog, startup_prog): @@ -97,13 +95,54 @@ class TestFleetLambMetaOptimizer(unittest.TestCase): optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) - ops_with_bias = [ + ops_without_wd = [ op for op in avg_cost.block.ops if op.type == 'lamb' and op.attr('op_role_var')[0].endswith('.b_0') ] - for op in ops_with_bias: + for op in ops_without_wd: self.assertEqual(op.attr('weight_decay'), 0) + def test_lamb_apply_with_amp(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "decr_every_n_nan_or_inf": 2, + "incr_every_n_steps": 1000, + "incr_ratio": 2.0, + "use_dynamic_loss_scaling": True, + "decr_ratio": 0.5, + "custom_white_list": ['softmax'], + "custom_black_list": ['tanh'], + } + strategy.lamb = True + strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': [], + } + + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('lamb', ops) + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py index 3caa1a4eac0bf191b13e6708b1a9adffdb111ca7..0a70710b4590e253463640634615c2d11ff36e9f 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py @@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker class TestFleetLarsMetaOptimizer(unittest.TestCase): def setUp(self): - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - os.environ["PADDLE_TRAINERS_NUM"] = "2" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" + 
os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" def net(self, main_prog, startup_prog): with fluid.program_guard(main_prog, startup_prog): @@ -52,6 +50,8 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): strategy.lars_configs = { "lars_coeff": 0.001, "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ["batch_norm", ".b"], } return avg_cost, strategy @@ -83,6 +83,70 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): ops = [op.type for op in avg_cost.block.ops] self.assertNotIn('lars_momentum', ops) + def test_lars_exclude_fn(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + startup_prog = fluid.Program() + train_prog = fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) + + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops_without_wd = [ + op for op in avg_cost.block.ops + if op.type == 'lars_momentum' and ("batch_norm" in op.attr( + 'op_role_var')[0] or ".b" in op.attr('op_role_var')[0]) + ] + for op in ops_without_wd: + self.assertEqual(op.attr('lars_weight_decay'), 0) + + def test_lars_apply_with_amp(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "decr_every_n_nan_or_inf": 2, + "incr_every_n_steps": 1000, + "incr_ratio": 2.0, + "use_dynamic_loss_scaling": True, + "decr_ratio": 0.5, + "custom_white_list": ['softmax'], + "custom_black_list": ['tanh'], + } + strategy.lars = True + strategy.lars_configs = { + "lars_coeff": 0.001, + "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ["batch_norm", ".b"], + } + + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('lars_momentum', ops) + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh index c5edc96963408bf1fad793f7271d75159934f019..e717962ead2e2da30092b12379bf36f368e8a735 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh @@ -79,9 +79,9 @@ if [ -f $file_1 ]; then rm $file_1 fi - +# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud unset PADDLE_PORT -unset TRAINER_PORTS_NUM +export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 echo "" echo "paddle.distributed.launch async poll process test" diff --git 
a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py index 07b988bf8752057e68925bc42f564a72d466361d..945f5ae57454b2c4a509badb93574a6e03b607e8 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py @@ -44,6 +44,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase): strategy.auto = True config = strategy.localsgd_configs config['k_steps'] = 1 + config['begin_step'] = 1 strategy.localsgd_configs = config optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py index eb5d9eb66608dd397dad773158c337fc67be2dbb..a831f6e838e950f9955c762544c312ed2d8766a9 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py @@ -163,10 +163,9 @@ class TestCloudRoleMaker2(unittest.TestCase): data = "1 1 1 1\n" f.write(data) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"]) - dataset.set_use_var([show, label]) + dataset._set_use_var([show, label]) dataset.load_into_memory() dataset.get_memory_data_size(fleet) dataset.get_shuffle_data_size(fleet) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index 921dbdbc6d4e1b169c2c8aa199ea15f886bd0128..5bcfc8720ddd2a8b495c50f886c03047c9abdb32 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -25,7 +25,7 @@ class TestFuseBatchNormActPass(unittest.TestCase): hidden1 = fluid.layers.conv2d( input=x, filter_size=3, - num_filters=32, + num_filters=16, stride=1, padding=1, act=None, @@ -43,7 +43,7 @@ class TestFuseBatchNormActPass(unittest.TestCase): bias_attr=bias_attr, act='relu', data_layout='NHWC') - hidden3 = fluid.layers.fc(input=hidden2, size=128, act='relu') + hidden3 = fluid.layers.fc(input=hidden2, size=32, act='relu') hidden4 = fluid.layers.batch_norm( input=hidden3, act='relu', data_layout='NHWC') prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax') @@ -63,7 +63,7 @@ class TestFuseBatchNormActPass(unittest.TestCase): startup_program = fluid.Program() x, y, loss = self.build_program(main_program, startup_program, use_cuda) exe = fluid.Executor(place) - iters = 10 + iters = 8 batch_size = 16 feeder = fluid.DataFeeder(feed_list=[x, y], place=place) diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index ce561cd317c48228c4877a2b65b67fe049a0d84a..26fc01ca04506758599ac5d6fe6842984a8d7a9c 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -34,18 +34,18 @@ def generate_proposals_in_python(scores, bbox_deltas, im_info, anchors, rpn_rois = [] rpn_roi_probs = [] - lod = [] + rois_num = [] num_images = scores.shape[0] for img_idx in range(num_images): img_i_boxes, img_i_probs = proposal_for_one_image( im_info[img_idx, :], all_anchors, variances, bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :], 
pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta) - lod.append(img_i_probs.shape[0]) + rois_num.append(img_i_probs.shape[0]) rpn_rois.append(img_i_boxes) rpn_roi_probs.append(img_i_probs) - return rpn_rois, rpn_roi_probs, lod + return rpn_rois, rpn_roi_probs, rois_num def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores, @@ -87,6 +87,10 @@ def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores, proposals = clip_tiled_boxes(proposals, im_info[:2]) # remove predicted boxes with height or width < min_size keep = filter_boxes(proposals, min_size, im_info) + if len(keep) == 0: + proposals = np.zeros((1, 4)).astype('float32') + scores = np.zeros((1, 1)).astype('float32') + return proposals, scores proposals = proposals[keep, :] scores = scores[keep, :] @@ -280,8 +284,8 @@ class TestGenerateProposalsOp(OpTest): } self.outputs = { - 'RpnRois': (self.rpn_rois[0], [self.lod]), - 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]), + 'RpnRois': (self.rpn_rois[0], [self.rois_num]), + 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]), } def test_check_output(self): @@ -320,7 +324,7 @@ class TestGenerateProposalsOp(OpTest): (batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32') def init_test_output(self): - self.rpn_rois, self.rpn_roi_probs, self.lod = generate_proposals_in_python( + self.rpn_rois, self.rpn_roi_probs, self.rois_num = generate_proposals_in_python( self.scores, self.bbox_deltas, self.im_info, self.anchors, self.variances, self.pre_nms_topN, self.post_nms_topN, self.nms_thresh, self.min_size, self.eta) @@ -349,12 +353,21 @@ class TestGenerateProposalsOutLodOp(TestGenerateProposalsOp): } self.outputs = { - 'RpnRois': (self.rpn_rois[0], [self.lod]), - 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]), - 'RpnRoisLod': (np.asarray( - self.lod, dtype=np.int32)) + 'RpnRois': (self.rpn_rois[0], [self.rois_num]), + 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]), + 'RpnRoisNum': (np.asarray( + self.rois_num, dtype=np.int32)) } +class TestGenerateProposalsOpNoBoxLeft(TestGenerateProposalsOp): + def init_test_params(self): + self.pre_nms_topN = 12000 # train 12000, test 2000 + self.post_nms_topN = 5000 # train 6000, test 1000 + self.nms_thresh = 0.7 + self.min_size = 1000.0 + self.eta = 1. 
+ + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py index 4a33f32a0b6977716d8065419f8e0f88d6c4f44a..ea94a8ba69a784efb1a2a12f6f251316553cab50 100644 --- a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py +++ b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py @@ -100,7 +100,7 @@ def add_cases(suite): GridSampleTestCase( methodName='runTest', mode='bilinear', - padding_mode='reflect', + padding_mode='reflection', align_corners=True)) suite.addTest( GridSampleTestCase( diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py index 4d1ed5aeb96ebbe064e35c1bee9d5775812440f7..bf2f9518fb0c720556b7eecdf5b286dea0fff96c 100644 --- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py +++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py @@ -73,7 +73,7 @@ def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode): if padding_mode == "border": grid_slice = clip(grid_slice, 0, max_val) - elif padding_mode == "reflect": + elif padding_mode == "reflection": double_range = 2 * max_val if align_corners else (max_val + 1) * 2 grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice + 0.5) @@ -211,7 +211,7 @@ class Case2(TestGridSamplerOp): self.grid_shape = (2, 8, 9, 2) self.theta_shape = (2, 2, 3) self.align_corners = False - self.padding_mode = "reflect" + self.padding_mode = "reflection" self.mode = "bilinear" @@ -221,7 +221,7 @@ class Case3(TestGridSamplerOp): self.grid_shape = (2, 8, 9, 2) self.theta_shape = (2, 2, 3) self.align_corners = True - self.padding_mode = "reflect" + self.padding_mode = "reflection" self.mode = "bilinear" @@ -231,7 +231,7 @@ class Case4(TestGridSamplerOp): self.grid_shape = (2, 8, 9, 2) self.theta_shape = (2, 2, 3) self.align_corners = False - self.padding_mode = "reflect" + self.padding_mode = "reflection" self.mode = "nearest" self.numeric_grad_delta = 0.0001 diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py index 7fb2cb0090da57ae837d1f774518dd90a41df56c..9b2d71c9f907779bc9b27b51e21056496f8d4dd5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py @@ -18,6 +18,7 @@ import multiprocessing import numpy as np import paddle.fluid as fluid from paddle.fluid import core +from paddle.fluid.reader import _reader_process_loop if sys.version_info[0] == 2: import Queue as queue @@ -66,7 +67,7 @@ class TestDygraphDataLoaderProcess(unittest.TestCase): batch_generator_creator(self.batch_size, self.batch_num), places=fluid.CPUPlace()) loader._data_queue = queue.Queue(self.batch_num + 1) - loader._reader_process_loop() + _reader_process_loop(loader._batch_reader, loader._data_queue) # For clean memory mapped files util_queue = multiprocessing.Queue(self.batch_num + 1) for _ in range(self.batch_num): @@ -94,7 +95,7 @@ class TestDygraphDataLoaderProcess(unittest.TestCase): loader._data_queue = queue.Queue(self.batch_num + 1) exception = None try: - loader._reader_process_loop() + _reader_process_loop(loader._batch_reader, loader._data_queue) except core.EnforceNotMet as ex: exception = ex self.assertIsNotNone(exception) diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py index da01be8159a5c5d277a22134eb60ef37ef85fc4f..772dd913e4d20ccf51601ea620822c250cb45320 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py @@ -15,18 +15,26 @@ import unittest import numpy as np import paddle.fluid as fluid +import warnings class TestImperativeNumpyBridge(unittest.TestCase): def test_tensor_from_numpy(self): data_np = np.array([[2, 3, 1]]).astype('float32') with fluid.dygraph.guard(fluid.CPUPlace()): - var = fluid.dygraph.to_variable(data_np, zero_copy=True) - self.assertTrue(np.array_equal(var.numpy(), data_np)) - data_np[0][0] = 4 - self.assertEqual(data_np[0][0], 4) - self.assertEqual(var[0][0].numpy()[0], 4) - self.assertTrue(np.array_equal(var.numpy(), data_np)) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + var = fluid.dygraph.to_variable(data_np, zero_copy=True) + assert "Currently, zero_copy is not supported, and it will be discarded." in str( + w[-1].message) + # Temporally diable zero_copy + # var = fluid.dygraph.to_variable(data_np, zero_copy=True) + # self.assertTrue(np.array_equal(var.numpy(), data_np)) + # data_np[0][0] = 4 + # self.assertEqual(data_np[0][0], 4) + # self.assertEqual(var[0][0].numpy()[0], 4) + # self.assertTrue(np.array_equal(var.numpy(), data_np)) + var2 = fluid.dygraph.to_variable(data_np, zero_copy=False) self.assertTrue(np.array_equal(var2.numpy(), data_np)) data_np[0][0] = -1 diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index f7fcc1ff561b90dc1b78a67ffbe7c047ed06d0e9..7e6ca8076de5186def1229b58bd23df73021430e 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -23,7 +23,7 @@ from paddle.static import InputSpec import paddle.fluid as fluid from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph import declarative, ProgramTranslator -from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME +from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME BATCH_SIZE = 32 BATCH_NUM = 10 @@ -56,6 +56,16 @@ class LinearNet(fluid.dygraph.Layer): return self._linear(x) +class LinearNetWithInputSpec(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetWithInputSpec, self).__init__() + self._linear = Linear(in_size, out_size) + + @declarative(input_spec=[InputSpec(shape=[None, 784], dtype='float32')]) + def forward(self, x): + return self._linear(x) + + class LinearNetNotDeclarative(fluid.dygraph.Layer): def __init__(self, in_size, out_size): super(LinearNetNotDeclarative, self).__init__() @@ -65,6 +75,23 @@ class LinearNetNotDeclarative(fluid.dygraph.Layer): return self._linear(x) +class LinerNetWithLabel(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super(LinerNetWithLabel, self).__init__() + self._linear = Linear(in_size, out_size) + + @declarative(input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image"), InputSpec( + shape=[None, 1], dtype='int64', name="label") + ]) + def forward(self, x, label): + out = self._linear(x) + loss = fluid.layers.cross_entropy(out, label) + avg_loss = fluid.layers.mean(loss) + return out, avg_loss + + class LinearNetReturnLoss(fluid.dygraph.Layer): def __init__(self, in_size, 
out_size): super(LinearNetReturnLoss, self).__init__() @@ -78,6 +105,72 @@ class LinearNetReturnLoss(fluid.dygraph.Layer): return z, loss +class LinearNetMultiInput(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetMultiInput, self).__init__() + self._linear1 = Linear(in_size, out_size) + self._linear2 = Linear(in_size, out_size) + + @declarative(input_spec=[ + InputSpec( + [None, 8], dtype='float32'), InputSpec( + [None, 8], dtype='float32') + ]) + def forward(self, x, y): + x_out = self._linear1(x) + y_out = self._linear2(y) + loss = fluid.layers.mean(x_out + y_out) + return x_out, y_out, loss + + +class MultiLoadingLinearNet(fluid.dygraph.Layer): + def __init__(self, size, model_path): + super(MultiLoadingLinearNet, self).__init__() + self._linear = Linear(size, size) + self._load_linear1 = fluid.dygraph.jit.load(model_path) + self._load_linear2 = fluid.dygraph.jit.load(model_path) + + @declarative + def forward(self, x): + tmp1 = self._linear(x) + tmp2 = self._load_linear1(tmp1) + tmp3 = self._load_linear2(tmp2) + y = self._linear(tmp3) + return y + + +class LinearNetReturnHidden(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetReturnHidden, self).__init__() + self._linear_1 = Linear(in_size, out_size) + self._linear_2 = Linear(in_size, out_size) + + @declarative + def forward(self, x): + y = self._linear_1(x) + z = self._linear_2(y) + loss = fluid.layers.mean(z) + return y, loss + + +class EmptyLayer(paddle.nn.Layer): + def __init__(self): + super(EmptyLayer, self).__init__() + + @paddle.jit.to_static + def forward(self, x): + return x + + +class NoParamLayer(paddle.nn.Layer): + def __init__(self): + super(NoParamLayer, self).__init__() + + @paddle.jit.to_static + def forward(self, x, y): + return x + y + + def train(layer, input_size=784, label_size=1): # create optimizer sgd = fluid.optimizer.SGDOptimizer( @@ -102,6 +195,27 @@ def train(layer, input_size=784, label_size=1): return [img], layer, avg_loss +def train_with_label(layer, input_size=784, label_size=1): + # create optimizer + sgd = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=layer.parameters()) + # create data loader + train_loader = fluid.io.DataLoader.from_generator(capacity=5) + train_loader.set_batch_generator( + random_batch_reader(input_size, label_size)) + # train + for data in train_loader(): + img, label = data + label.stop_gradient = True + + out, avg_loss = layer(img, label) + + avg_loss.backward() + sgd.minimize(avg_loss) + layer.clear_gradients() + return out + + class TestJitSaveLoad(unittest.TestCase): def setUp(self): self.model_path = "model.test_jit_save_load" @@ -159,8 +273,11 @@ class TestJitSaveLoad(unittest.TestCase): train_layer.eval() # construct new model new_layer = LinearNet(784, 1) - model_dict, _ = fluid.dygraph.load_dygraph(self.model_path) - new_layer.set_dict(model_dict) + orig_state_dict = new_layer.state_dict() + load_state_dict, _ = fluid.dygraph.load_dygraph(self.model_path) + for structured_name in orig_state_dict: + self.assertTrue(structured_name in load_state_dict) + new_layer.set_state_dict(load_state_dict) new_layer.eval() # inference & compare x = fluid.dygraph.to_variable( @@ -168,38 +285,20 @@ class TestJitSaveLoad(unittest.TestCase): self.assertTrue( np.array_equal(train_layer(x).numpy(), new_layer(x).numpy())) - def test_save_get_program_failed(self): - layer = LinearNetNotDeclarative(784, 1) - example_inputs, layer, _ = train(layer) - with self.assertRaises(RuntimeError): - 
fluid.dygraph.jit.save( - layer=layer, - model_path=self.model_path, - input_spec=example_inputs) - def test_load_dygraph_no_path(self): model_path = "model.test_jit_save_load.no_path" new_layer = LinearNet(784, 1) with self.assertRaises(ValueError): model_dict, _ = fluid.dygraph.load_dygraph(model_path) - -class LinearNetMultiInput(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): - super(LinearNetMultiInput, self).__init__() - self._linear1 = Linear(in_size, out_size) - # self._linear2 = Linear(in_size, out_size) - - @declarative(input_spec=[ - InputSpec( - [None, 8], dtype='float32'), InputSpec( - [None, 8], dtype='float32') - ]) - def forward(self, x, y): - x_out = self._linear1(x) - y_out = self._linear1(y) - loss = fluid.layers.mean(x_out + y_out) - return x_out, y_out, loss + def test_jit_load_model_incomplete(self): + model_path = "model.test_jit_save_load.remove_variables" + self.train_and_save_model(model_path=model_path) + # remove `__variables__` + var_path = os.path.join(model_path, VARIABLE_FILENAME) + os.remove(var_path) + with self.assertRaises(ValueError): + paddle.jit.load(model_path) class TestSaveLoadWithInputSpec(unittest.TestCase): @@ -345,22 +444,6 @@ class TestJitSaveLoadConfig(unittest.TestCase): np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy())) -class MultiLoadingLinearNet(fluid.dygraph.Layer): - def __init__(self, size, model_path): - super(MultiLoadingLinearNet, self).__init__() - self._linear = Linear(size, size) - self._load_linear1 = fluid.dygraph.jit.load(model_path) - self._load_linear2 = fluid.dygraph.jit.load(model_path) - - @declarative - def forward(self, x): - tmp1 = self._linear(x) - tmp2 = self._load_linear1(tmp1) - tmp3 = self._load_linear2(tmp2) - y = self._linear(tmp3) - return y - - class TestJitMultipleLoading(unittest.TestCase): def setUp(self): self.linear_size = 4 @@ -389,20 +472,6 @@ class TestJitMultipleLoading(unittest.TestCase): name_set.add(var.name) -class LinearNetReturnHidden(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): - super(LinearNetReturnHidden, self).__init__() - self._linear_1 = Linear(in_size, out_size) - self._linear_2 = Linear(in_size, out_size) - - @declarative - def forward(self, x): - y = self._linear_1(x) - z = self._linear_2(y) - loss = fluid.layers.mean(z) - return y, loss - - class TestJitPruneModelAndLoad(unittest.TestCase): def setUp(self): self.linear_size = 4 @@ -461,5 +530,230 @@ class TestJitPruneModelAndLoad(unittest.TestCase): fluid.dygraph.jit.load(self.model_path) +class TestJitSaveMultiCases(unittest.TestCase): + def setUp(self): + # enable dygraph mode + fluid.enable_dygraph() + # config seed + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + def verify_inference_correctness(self, layer, model_path, with_label=False): + layer.eval() + loaded_layer = paddle.jit.load(model_path) + loaded_layer.eval() + # inference & compare + x = paddle.to_variable(np.random.random((1, 784)).astype('float32')) + if with_label: + y = paddle.to_variable(np.random.random((1, 1)).astype('int64')) + pred, _ = layer(x, y) + pred = pred.numpy() + else: + pred = layer(x).numpy() + loaded_pred = loaded_layer(x).numpy() + self.assertTrue( + np.array_equal(pred, loaded_pred), + msg="Result diff when load and inference:\nlayer result:\n{}\n" \ + "loaded layer result:\n{}".format(pred, loaded_pred)) + + def test_no_prune_to_static_after_train(self): + layer = LinearNet(784, 1) + + train(layer) + + model_path = "test_no_prune_to_static_after_train" + 
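    # Note: each TestJitSaveMultiCases case from here on follows one
    # contract -- optionally train, call paddle.jit.save (with input_spec
    # and, for the pruning cases, a SaveLoadConfig whose output_spec picks
    # the surviving forward outputs), then reload via paddle.jit.load and
    # compare against the in-memory layer with verify_inference_correctness
    # defined above.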
paddle.jit.save(layer, model_path) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_to_static_no_train(self): + layer = LinearNetWithInputSpec(784, 1) + + model_path = "test_no_prune_to_static_no_train" + paddle.jit.save(layer, model_path) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_no_to_static_after_train(self): + layer = LinearNetNotDeclarative(784, 1) + + train(layer) + + model_path = "test_no_prune_no_to_static_after_train" + paddle.jit.save( + layer, + model_path, + input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_no_to_static_after_train_with_examples(self): + layer = LinearNetNotDeclarative(784, 1) + + example_inputs, _, _ = train(layer) + + model_path = "test_no_prune_no_to_static_after_train_with_examples" + fluid.dygraph.jit.save( + layer=layer, model_path=model_path, input_spec=example_inputs) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_no_to_static_no_train(self): + layer = LinearNetNotDeclarative(784, 1) + + model_path = "test_no_prune_no_to_static_no_train" + paddle.jit.save( + layer, + model_path, + input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + + self.verify_inference_correctness(layer, model_path) + + def test_prune_to_static_after_train(self): + layer = LinerNetWithLabel(784, 1) + + out = train_with_label(layer) + + model_path = "test_prune_to_static_after_train" + configs = paddle.SaveLoadConfig() + configs.output_spec = [out] + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image") + ], + configs=configs) + + self.verify_inference_correctness(layer, model_path, True) + + def test_prune_to_static_no_train(self): + layer = LinerNetWithLabel(784, 1) + + model_path = "test_prune_to_static_no_train" + configs = paddle.SaveLoadConfig() + # TODO: no train, cannot get output_spec var here + # now only can use index + configs.output_spec = layer.forward.outputs[:1] + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image") + ], + configs=configs) + + self.verify_inference_correctness(layer, model_path, True) + + def test_no_prune_input_spec_name_warning(self): + layer = LinearNetWithInputSpec(784, 1) + + train(layer) + + model_path = "test_no_prune_input_spec_name_warning" + paddle.jit.save( + layer, + model_path, + input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name='feed_input') + ]) + + self.verify_inference_correctness(layer, model_path) + + def test_not_prune_output_spec_name_warning(self): + layer = LinearNet(784, 1) + + train(layer) + + model_path = "test_not_prune_output_spec_name_warning" + configs = paddle.SaveLoadConfig() + out = paddle.to_variable(np.random.random((1, 1)).astype('float')) + configs.output_spec = [out] + paddle.jit.save(layer, model_path, configs=configs) + + self.verify_inference_correctness(layer, model_path) + + def test_prune_input_spec_name_error(self): + layer = LinerNetWithLabel(784, 1) + + model_path = "test_prune_input_spec_name_error" + with self.assertRaises(ValueError): + paddle.jit.save( + layer, + model_path, + input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + with self.assertRaises(ValueError): + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + 
shape=[None, 784], dtype='float32', name='feed_input') + ]) + + def test_prune_output_spec_name_error(self): + layer = LinerNetWithLabel(784, 1) + + train_with_label(layer) + + model_path = "test_prune_to_static_after_train" + configs = paddle.SaveLoadConfig() + out = paddle.to_variable(np.random.random((1, 1)).astype('float')) + configs.output_spec = [out] + with self.assertRaises(ValueError): + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image") + ], + configs=configs) + + +class TestJitSaveLoadEmptyLayer(unittest.TestCase): + def setUp(self): + self.model_path = "model.jit_save_load_empty_layer" + # enable dygraph mode + paddle.disable_static() + + def test_save_load_empty_layer(self): + layer = EmptyLayer() + x = paddle.to_variable(np.random.random((10)).astype('float32')) + out = layer(x) + paddle.jit.save(layer, self.model_path) + load_layer = paddle.jit.load(self.model_path) + load_out = load_layer(x) + self.assertTrue(np.array_equal(out, load_out)) + + +class TestJitSaveLoadNoParamLayer(unittest.TestCase): + def setUp(self): + self.model_path = "model.jit_save_load_no_param_layer" + # enable dygraph mode + paddle.disable_static() + + def test_save_load_no_param_layer(self): + layer = NoParamLayer() + x = paddle.to_variable(np.random.random((5)).astype('float32')) + y = paddle.to_variable(np.random.random((5)).astype('float32')) + out = layer(x, y) + paddle.jit.save(layer, self.model_path) + load_layer = paddle.jit.load(self.model_path) + load_out = load_layer(x, y) + self.assertTrue(np.array_equal(out, load_out)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py index 8780727e4cb276a989a8d04d05c6419a4874e7f5..041fe4e9043d60852fcaab42bc233b63b39609ce 100644 --- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py @@ -24,7 +24,10 @@ def kldiv_loss(x, target, reduction): loss = np.where(target >= 0, output, np.zeros_like(x)) if reduction == "batchmean": - return loss.sum() / x.shape[0] + if len(x.shape) > 0: + return loss.sum() / x.shape[0] + else: + return loss.sum() if reduction == "mean": return loss.mean() if reduction == "sum": @@ -93,6 +96,9 @@ class TestKLDivLossDygraph(unittest.TestCase): def test_kl_loss_batchmean(self): self.run_kl_loss('batchmean') + def test_kl_loss_batchmean_shape(self): + self.run_kl_loss('batchmean', ()) + def test_kl_loss_mean(self): self.run_kl_loss('mean') diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh index 98c907a551965331f79d1635362213b43d867002..958d78246627d4cd2f826f74aeccff5ffe254034 100644 --- a/python/paddle/fluid/tests/unittests/test_launch.sh +++ b/python/paddle/fluid/tests/unittests/test_launch.sh @@ -48,9 +48,9 @@ if [ -f $file_1 ]; then rm $file_1 fi - +# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud unset PADDLE_PORT -unset TRAINER_PORTS_NUM +export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 echo "" echo "paddle.distributed.launch async poll process test" diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index b76887f0965ca64b2b40bf9c0ce6e82b44fdad2f..26073f49bdd3d494da7b39346c5bafb2aefba56a 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ 
b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2677,13 +2677,6 @@ class TestBook(LayerTest): out = layers.sigmoid(input, name='sigmoid') return (out) - def make_logsigmoid(self): - with program_guard(fluid.default_main_program(), - fluid.default_startup_program()): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.logsigmoid(input, name='logsigmoid') - return (out) - def make_exp(self): with program_guard(fluid.default_main_program(), fluid.default_startup_program()): @@ -3318,15 +3311,29 @@ class TestBook(LayerTest): return (out) def test_roi_pool(self): - # TODO(minqiyang): dygraph do not support lod now + x_np = np.random.rand(2, 3, 8, 8).astype('float32') + rois_np = np.random.rand(3, 4).astype('float32') + rois_num_np = np.array([1, 2]).astype('int32') + with self.static_graph(): - x = layers.data(name="x", shape=[256, 30, 30], dtype="float32") - rois = layers.data( - name="rois", shape=[4], dtype="float32", lod_level=1) - rois_lod = layers.data( - name="rois_lod", shape=[None, ], dtype="int", lod_level=1) - output = layers.roi_pool(x, rois, 7, 7, 0.6, rois_lod) - return (output) + x = layers.data(name="x", shape=[3, 8, 8], dtype="float32") + rois = layers.data(name="rois", shape=[4], dtype="float32") + rois_num = fluid.data(name="rois_num", shape=[None], dtype="int32") + output = layers.roi_pool(x, rois, 4, 4, 0.5, rois_num=rois_num) + static_res = self.get_static_graph_result( + feed={'x': x_np, + 'rois': rois_np, + 'rois_num': rois_num_np}, + fetch_list=[output])[0] + + with self.dynamic_graph(): + x_dy = base.to_variable(x_np) + rois_dy = base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + dy_res = layers.roi_pool( + x_dy, rois_dy, 4, 4, 0.5, rois_num=rois_num_dy) + dy_res_value = dy_res[0].numpy() + self.assertTrue(np.array_equal(static_res, dy_res_value)) def test_sequence_enumerate(self): # TODO(minqiyang): dygraph do not support lod now @@ -3335,16 +3342,29 @@ class TestBook(LayerTest): out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0) def test_roi_align(self): - # TODO(minqiyang): dygraph do not support lod now + x_np = np.random.rand(2, 3, 8, 8).astype('float32') + rois_np = np.random.rand(3, 4).astype('float32') + rois_num_np = np.array([1, 2]).astype('int32') + with self.static_graph(): - x = layers.data(name="x", shape=[256, 30, 30], dtype="float32") - rois = layers.data( - name="rois", shape=[4], dtype="float32", lod_level=1) - rois_lod = layers.data( - name="rois_lod", shape=[None, ], dtype="int", lod_level=1) - output = layers.roi_align(x, rois, 14, 14, 0.5, 2, 'roi_align', - rois_lod) - return (output) + x = layers.data(name="x", shape=[3, 8, 8], dtype="float32") + rois = layers.data(name="rois", shape=[4], dtype="float32") + rois_num = fluid.data(name="rois_num", shape=[None], dtype="int32") + output = layers.roi_align(x, rois, 4, 4, 0.5, 2, rois_num=rois_num) + static_res = self.get_static_graph_result( + feed={'x': x_np, + 'rois': rois_np, + 'rois_num': rois_num_np}, + fetch_list=[output])[0] + + with self.dynamic_graph(): + x_dy = base.to_variable(x_np) + rois_dy = base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + dy_res = layers.roi_align( + x_dy, rois_dy, 4, 4, 0.5, 2, rois_num=rois_num_dy) + dy_res_value = dy_res.numpy() + self.assertTrue(np.array_equal(static_res, dy_res_value)) def test_roi_perspective_transform(self): # TODO(minqiyang): dygraph do not support lod now diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py 
b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py index 53e8b02081ae3acf8a7fb5dd2bc6e05cbc3be901..c9948edad0061012cf028bec674a4bb713364541 100755 --- a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py @@ -293,7 +293,7 @@ class TestLinearInterpOpAPI2_0(unittest.TestCase): # dygraph x_data = np.random.random((1, 3, 128)).astype("float32") - us_1 = paddle.nn.UpSample( + us_1 = paddle.nn.Upsample( size=[64, ], mode='linear', align_mode=1, @@ -385,19 +385,19 @@ class TestLinearInterpOpError(unittest.TestCase): def input_shape_error(): x1 = fluid.data(name="x1", shape=[1], dtype="float32") - out1 = paddle.nn.UpSample( + out1 = paddle.nn.Upsample( size=[256, ], data_format='NCW', mode='linear') out1_res = out1(x1) def data_format_error(): x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32") - out2 = paddle.nn.UpSample( + out2 = paddle.nn.Upsample( size=[256, ], data_format='NHWCD', mode='linear') out2_res = out2(x2) def out_shape_error(): x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32") - out3 = paddle.nn.UpSample( + out3 = paddle.nn.Upsample( size=[ 256, 256, diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py index 04b56677fc158583fe79ec0dc1276210bd2ebbdc..b34989f5f5c79dfd27158f120175824389ac9731 100755 --- a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py @@ -26,6 +26,7 @@ from paddle.nn.functional import interpolate def linear_interp_np(input, out_w, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -44,7 +45,10 @@ def linear_interp_np(input, if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_w)) @@ -81,6 +85,7 @@ class TestLinearInterpOp(OpTest): self.op_type = "linear_interp_v2" input_np = np.random.random(self.input_shape).astype("float64") + scale_w = 0 if self.data_layout == "NCHW": in_w = self.input_shape[2] else: @@ -95,7 +100,7 @@ class TestLinearInterpOp(OpTest): else: out_w = self.out_w - output_np = linear_interp_np(input_np, out_w, self.out_size, + output_np = linear_interp_np(input_np, out_w, self.scale, self.out_size, self.actual_shape, self.align_corners, self.align_mode, self.data_layout) self.inputs = {'X': input_np} @@ -195,7 +200,7 @@ class TestLinearInterpOpSizeTensor(TestLinearInterpOp): else: out_w = self.out_w - output_np = linear_interp_np(input_np, out_w, self.out_size, + output_np = linear_interp_np(input_np, out_w, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode, self.data_layout) @@ -309,7 +314,7 @@ class TestLinearInterpOpAPI2_0(unittest.TestCase): # dygraph x_data = np.random.random((1, 3, 128)).astype("float32") - us_1 = paddle.nn.UpSample( + us_1 = paddle.nn.Upsample( size=[64, ], mode='linear', align_mode=1, @@ -342,7 +347,7 @@ class TestResizeLinearOpUint8(OpTest): else: out_w = self.out_w - output_np = linear_interp_np(input_np, out_w, self.out_size, + output_np = linear_interp_np(input_np, out_w, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode) self.inputs = {'X': input_np} @@ -410,19 +415,19 @@ class TestLinearInterpOpError(unittest.TestCase): def input_shape_error(): x1 = fluid.data(name="x1", shape=[1], dtype="float32") - out1 = 
paddle.nn.UpSample( + out1 = paddle.nn.Upsample( size=[256, ], data_format='NCW', mode='linear') out1_res = out1(x1) def data_format_error(): x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32") - out2 = paddle.nn.UpSample( + out2 = paddle.nn.Upsample( size=[256, ], data_format='NHWCD', mode='linear') out2_res = out2(x2) def out_shape_error(): x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32") - out3 = paddle.nn.UpSample( + out3 = paddle.nn.Upsample( size=[ 256, 256, diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py index c2201a52605bc87246fb9c8734494b19f83ff180..cf9203dffcbaa5da641b3f7cb8925ac9efcbe115 100644 --- a/python/paddle/fluid/tests/unittests/test_logsumexp.py +++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py @@ -46,8 +46,8 @@ class TestLogsumexp(OpTest): self.inputs = {'X': x} self.outputs = {'Out': out} self.attrs = { - 'dim': self.axis, - 'keep_dim': self.keepdim, + 'axis': self.axis, + 'keepdim': self.keepdim, 'reduce_all': self.reduce_all } diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index 1f3dab67f2afe4e2b0a655634bb808ad0951ae9e..29a0fa55f7729bc39b2e9202397563a5cb10747c 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -400,7 +400,8 @@ class TestCUDNNLstmOp(OpTest): 'Input': input, 'W': flat_w, 'InitH': init_h, - 'InitC': init_c + 'InitC': init_c, + 'SequenceLength': self.sequence_length } self.attrs = { 'dropout_prob': 0.0, @@ -408,7 +409,6 @@ class TestCUDNNLstmOp(OpTest): 'input_size': input_size, 'hidden_size': hidden_size, 'num_layers': 1, - 'sequence_length': self.sequence_length.tolist() } self.outputs = { 'Out': output, @@ -436,13 +436,6 @@ class TestCUDNNLstmOp(OpTest): @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestCUDNNLstmOp2(TestCUDNNLstmOp): - def set_attrs(self): - self.sequence_length = np.array([], dtype=np.int32) - - -@unittest.skipIf(not core.is_compiled_with_cuda(), - "core is not compiled with CUDA") -class TestCUDNNLstmOp3(TestCUDNNLstmOp): def set_attrs(self): self.num_layers = 2 diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py index 00137f63e244a0e166047e89f9ef436da158ed16..f6eff22d6ce5f06d8853d6244f79b4b07b3fa4f5 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py @@ -189,15 +189,15 @@ class TestMathOpPatches(unittest.TestCase): @prog_scope() def test_integer_div(self): a = fluid.layers.data(name="a", shape=[1], dtype='int64') - b = a / 2 + b = a / 7 place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.array([3, 4, 10, 14, 9, 18]) + a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('int64') b_np, = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - # for paddle2.0, use true_divide - b_np_actual = (a_np / 2.0) + + b_np_actual = (a_np / 7).astype('int64') self.assertTrue(numpy.array_equal(b_np, b_np_actual)) @prog_scope() diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py index 9bb12d546550a821e8a133dd9c91d5d41a50b1b2..a70862f40197c513a0cd04753553264708ee2a1c 100644 --- 
a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py @@ -307,7 +307,7 @@ class TestMathOpPatchesVarBase(unittest.TestCase): np.array_equal(x.sigmoid().numpy(), fluid.layers.sigmoid(x).numpy( ))) self.assertTrue( - np.array_equal(x.logsigmoid().numpy(), + np.array_equal(x.log_sigmoid().numpy(), fluid.layers.logsigmoid(x).numpy())) self.assertTrue(np.array_equal(x.exp().numpy(), paddle.exp(x).numpy())) self.assertTrue( diff --git a/python/paddle/fluid/tests/unittests/test_monitor.py b/python/paddle/fluid/tests/unittests/test_monitor.py index f6207edb41c190ac51dfe67dad22bb0191a67a07..cf273876b1f2f8a9b4828375ca6e20e591feb306 100644 --- a/python/paddle/fluid/tests/unittests/test_monitor.py +++ b/python/paddle/fluid/tests/unittests/test_monitor.py @@ -52,18 +52,17 @@ class TestDatasetWithStat(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset._set_batch_size(32) + dataset._set_thread(3) dataset.set_filelist([ "test_in_memory_dataset_run_a.txt", "test_in_memory_dataset_run_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) + dataset._set_pipe_command("cat") + dataset._set_use_var(slots_vars) dataset.load_into_memory() - dataset.set_fea_eval(1, True) + dataset._set_fea_eval(1, True) dataset.slots_shuffle(["slot1"]) exe = fluid.Executor(fluid.CPUPlace()) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py index 3a8867f6bd29f5bc0e512f9c8b22ecf192253fc7..6fd14b40bc9108b6075a0ac1f40cbefd79b8f0d9 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py @@ -27,6 +27,7 @@ import paddle.fluid.core as core from paddle.io import Dataset, IterableDataset, BatchSampler, DataLoader from paddle.fluid.dygraph.nn import Linear from paddle.fluid.dygraph.base import to_variable +from paddle.fluid.dataloader.dataloader_iter import _worker_loop class RandomDataset(Dataset): @@ -185,9 +186,10 @@ class TestDataLoaderWorkerLoop(unittest.TestCase): for i in range(10): indices_queue.put([i, i + 10]) indices_queue.put(None) - loader._worker_loop( - loader._dataset, 0, indices_queue, loader._data_queue, - loader._workers_done_event, _collate_fn, _init_fn, 0, 1) + _worker_loop(loader._dataset, 0, indices_queue, + loader._data_queue, loader._workers_done_event, + _collate_fn, _init_fn, 0, 1, + loader._use_shared_memory) self.assertTrue(False) except AssertionError: pass @@ -228,9 +230,10 @@ class TestDataLoaderWorkerLoop(unittest.TestCase): indices_queue.put([i, i + 10]) indices_queue.put(None) loader._workers_done_event.set() - loader._worker_loop( - loader._dataset, 0, indices_queue, loader._data_queue, - loader._workers_done_event, _collate_fn, _init_fn, 0, 1) + _worker_loop(loader._dataset, 0, indices_queue, + loader._data_queue, loader._workers_done_event, + _collate_fn, _init_fn, 0, 1, + loader._use_shared_memory) self.assertTrue(True) except AssertionError: pass diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py index 
19da09a463f3cc6224a22eb90278abae9ec59b91..2feca1c30689cec20e1d696cc672516414786038 100755 --- a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py @@ -26,6 +26,8 @@ import paddle def nearest_neighbor_interp_np(X, out_h, out_w, + scale_h=0, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -46,13 +48,18 @@ def nearest_neighbor_interp_np(X, if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if (out_w > 1): if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w - + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((n, c, out_h, out_w)) if align_corners: @@ -89,7 +96,8 @@ class TestNearestInterpOp(OpTest): else: in_h = self.input_shape[1] in_w = self.input_shape[2] - + scale_h = 0 + scale_w = 0 if self.scale: if isinstance(self.scale, float) or isinstance(self.scale, int): if self.scale > 0: @@ -106,8 +114,8 @@ class TestNearestInterpOp(OpTest): out_w = self.out_w output_np = nearest_neighbor_interp_np( - input_np, out_h, out_w, self.out_size, self.actual_shape, - self.align_corners, self.data_layout) + input_np, out_h, out_w, scale_h, scale_w, self.out_size, + self.actual_shape, self.align_corners, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -265,7 +273,7 @@ class TestNearestInterpOpUint8(OpTest): out_h = self.out_h out_w = self.out_w - output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape, self.align_corners) self.inputs = {'X': input_np} @@ -408,7 +416,7 @@ class TestNearestInterpOp_attr_tensor(OpTest): if isinstance(self.scale, list) and len(self.scale) == 1: self.scale = [self.scale[0], self.scale[0]] self.attrs['scale'] = self.scale - output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape, self.align_corners) self.outputs = {'Out': output_np} diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py index e0edf9019356f38eb3c74b9cadfa6ae575e9b823..43a0d481b28fdc47dec52fe9763dd920fd5a76a2 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py @@ -16,20 +16,49 @@ from __future__ import print_function import unittest +import paddle +import paddle.nn as nn +import numpy as np + +paddle.disable_static() + class EmbeddingDygraph(unittest.TestCase): def test_1(self): - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() + x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) + y_data = np.arange(6, 12).reshape((3, 2)).astype(np.float32) + paddle.disable_static(paddle.CPUPlace()) + x = paddle.to_tensor(x_data, stop_gradient=False) + y = paddle.to_tensor(y_data, stop_gradient=False) + + embedding = paddle.nn.Embedding(10, 3, sparse=True) + + w0 = np.full(shape=(10, 3), fill_value=2).astype(np.float32) + embedding.weight.set_value(w0) + + adam = paddle.optimizer.Adam( + parameters=[embedding.weight], learning_rate=0.01) + 
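    # Note: the lines below follow the usual dygraph training step order --
    # clear stale gradients, run the forward pass, backward() to populate
    # grads, then step() so Adam applies the (sparse=True) update to
    # embedding.weight.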
adam.clear_grad() + + out = embedding(x) + out.backward() + adam.step() + + def test_2(self): + x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) + y_data = np.arange(6, 12).reshape((3, 2)).astype(np.float32) + paddle.disable_static(paddle.CPUPlace()) + x = paddle.to_tensor(x_data, stop_gradient=False) + y = paddle.to_tensor(y_data, stop_gradient=False) + + with self.assertRaises(ValueError): + embedding = paddle.nn.Embedding(10, 3, padding_idx=11, sparse=True) - # example 1 - inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64') - inp_word.shape # [2, 3] - dict_size = 20 + with self.assertRaises(ValueError): + embedding = paddle.nn.Embedding(-1, 3, sparse=True) - emb = nn.Embedding(dict_size, 32, weight_attr='emb.w', sparse=False) + with self.assertRaises(ValueError): + embedding = paddle.nn.Embedding(10, -3, sparse=True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py index c9c91ceb39de42c44f9ce81658aa79b896999552..4af0cce12b7334857c54ae9e8e9418848275ff32 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py @@ -73,8 +73,13 @@ class EmbeddingStatic(unittest.TestCase): dtype="int32") emb = functional.embedding( - x=label, weight=weight, sparse=True, name="embedding") + x=label, + weight=weight, + padding_idx=129, + sparse=True, + name="embedding") + with self.assertRaises(ValueError): test_bad_x() diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index c6cfe01dce40458684c7464ca5ebddd389c62cbe..5d1e016287e07a8505336e6cb447c0e1b29a2ec2 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -17,6 +17,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core @@ -101,6 +102,29 @@ class TestReduceMeanWithDimDoubleGradCheck(unittest.TestCase): self.func(p) +class TestReduceSumWithDimDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + shape = [7, 11] + eps = 0.05 + dtype = np.float64 + + x = layers.data('x', shape, False, dtype) + x.persistable = True + y = layers.reduce_sum(x, dim=0) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + class TestMulDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): @@ -130,6 +154,30 @@ class TestMulDoubleGradCheck(unittest.TestCase): class TestReshapeDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + x_shape = [3, 12] + expand_times = [4, 9] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', x_shape, False, dtype) + x.persistable = True + out = layers.expand(x, expand_times) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], out, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class 
TestExpandDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): x_shape = [3, 12] @@ -153,5 +201,53 @@ class TestReshapeDoubleGradCheck(unittest.TestCase): self.func(p) +class TestTileDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + x_shape = [3, 12] + repeat_times = [4, 9] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', x_shape, False, dtype) + x.persistable = True + out = paddle.tile(x, repeat_times) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], out, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestExpandV2DoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + x_shape = [1, 12] + new_shape = [4, 12] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', x_shape, False, dtype) + x.persistable = True + out = paddle.expand(x, new_shape) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], out, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index c047cf6ddff78641b918de75a284574175bb3bca..352089e1fb75fa4c3423d29012fd85c3d611c81b 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -26,11 +26,11 @@ def p_norm(x, axis, porder, keepdims=False): if axis is None: x = x.flatten() if porder == np.inf: - r = np.amax(np.abs(x)) + r = np.amax(np.abs(x), keepdims=keepdims) elif porder == -np.inf: - r = np.amin(np.abs(x)) + r = np.amin(np.abs(x), keepdims=keepdims) else: - r = np.linalg.norm(x, ord=porder) + r = np.linalg.norm(x, ord=porder, keepdims=keepdims) elif isinstance(axis, list or tuple) and len(axis) == 2: if porder == np.inf: axis = tuple(axis) @@ -41,10 +41,10 @@ def p_norm(x, axis, porder, keepdims=False): elif porder == 0: axis = tuple(axis) r = x.astype(bool) - r = np.sum(r, axis) + r = np.sum(r, axis, keepdims=keepdims) elif porder == 1: axis = tuple(axis) - r = np.sum(np.abs(x), axis) + r = np.sum(np.abs(x), axis, keepdims=keepdims) else: axis = tuple(axis) xp = np.power(np.abs(x), porder) @@ -61,7 +61,7 @@ def p_norm(x, axis, porder, keepdims=False): def frobenius_norm(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) - if axis is None: axis = (-2, -1) + if axis is None: x = x.reshape(1, x.size) r = np.linalg.norm( x, ord='fro', axis=axis, keepdims=keepdims).astype(x.dtype) return r @@ -217,28 +217,37 @@ class TestPnormOp5(TestPnormOp): self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) -def run_fro(self, p, axis, shape_x, dtype): +def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False): with fluid.program_guard(fluid.Program()): data = fluid.data(name="X", shape=shape_x, dtype=dtype) - out = paddle.norm(x=data, p=p, axis=axis) + out = paddle.norm(x=data, p=p, axis=axis, keepdim=keep_dim) place = fluid.CPUPlace() exe = fluid.Executor(place) np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype) - expected_result = frobenius_norm(np_input, axis=axis) + expected_result = 
frobenius_norm(np_input, axis=axis, keepdims=keep_dim) result, = exe.run(feed={"X": np_input}, fetch_list=[out]) self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True) + if keep_dim and check_dim: + self.assertEqual( + (np.abs(np.array(result.shape) - np.array(expected_result.shape)) < + 1e-6).all(), True) -def run_pnorm(self, p, axis, shape_x, dtype): +def run_pnorm(self, p, axis, shape_x, dtype, keep_dim, check_dim=False): with fluid.program_guard(fluid.Program()): data = fluid.data(name="X", shape=shape_x, dtype=dtype) - out = paddle.norm(x=data, p=p, axis=axis) + out = paddle.norm(x=data, p=p, axis=axis, keepdim=keep_dim) place = fluid.CPUPlace() exe = fluid.Executor(place) np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype) - expected_result = p_norm(np_input, porder=p, axis=axis).astype(dtype) + expected_result = p_norm( + np_input, porder=p, axis=axis, keepdims=keep_dim).astype(dtype) result, = exe.run(feed={"X": np_input}, fetch_list=[out]) - self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True) + self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True) + if keep_dim and check_dim: + self.assertEqual( + (np.abs(np.array(result.shape) - np.array(expected_result.shape)) < + 1e-6).all(), True) def run_graph(self, p, axis, shape_x, dtype): @@ -253,6 +262,7 @@ def run_graph(self, p, axis, shape_x, dtype): # compute frobenius norm along last two dimensions. out_fro = paddle.norm(x, p='fro') + out_fro = paddle.norm(x, p='fro', axis=0) out_fro = paddle.norm(x, p='fro', axis=[0, 1]) # compute 2-order norm along [0,1] dimension. out_pnorm = paddle.norm(x, p=2, axis=[0, 1]) @@ -274,27 +284,133 @@ def run_graph(self, p, axis, shape_x, dtype): class API_NormTest(unittest.TestCase): def test_basic(self): - run_fro(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32") - run_fro(self, p='fro', axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") - run_pnorm(self, p=2, axis=None, shape_x=[3, 4], dtype="float32") - run_pnorm(self, p=2, axis=1, shape_x=[3, 4], dtype="float64") - run_pnorm(self, p=np.inf, axis=0, shape_x=[2, 3, 4], dtype="float32") - run_pnorm(self, p=np.inf, axis=None, shape_x=[2, 3, 4], dtype="float32") - run_pnorm(self, p=-np.inf, axis=0, shape_x=[2, 3, 4], dtype="float64") - run_pnorm( - self, p=-np.inf, axis=None, shape_x=[2, 3, 4], dtype="float64") - run_pnorm(self, p=0, axis=1, shape_x=[3, 4], dtype="float64") - - run_pnorm(self, p=1, axis=1, shape_x=[3, 4], dtype="float64") - run_pnorm(self, p=0, axis=None, shape_x=[3, 4], dtype="float64") - run_pnorm(self, p=2, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") - run_pnorm(self, p=2, axis=-1, shape_x=[2, 3, 4], dtype="float64") - run_pnorm(self, p=1, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") - run_pnorm(self, p=0, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") - run_pnorm( - self, p=np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") - run_pnorm( - self, p=-np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") + keep_dims = {False, True} + for keep in keep_dims: + run_fro( + self, + p='fro', + axis=None, + shape_x=[2, 3, 4], + dtype="float32", + keep_dim=keep) + run_fro( + self, + p='fro', + axis=[0, 1], + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=2, + axis=None, + shape_x=[3, 4], + dtype="float32", + keep_dim=keep) + run_pnorm( + self, + p=2, + axis=1, + shape_x=[3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=np.inf, + axis=0, + shape_x=[2, 3, 4], + 
dtype="float32", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=np.inf, + axis=None, + shape_x=[2, 3, 4], + dtype="float32", + keep_dim=keep) + run_pnorm( + self, + p=-np.inf, + axis=0, + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=-np.inf, + axis=None, + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep) + run_pnorm( + self, + p=0, + axis=1, + shape_x=[3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + + run_pnorm( + self, + p=1, + axis=1, + shape_x=[3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=0, + axis=None, + shape_x=[3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=2, + axis=[0, 1], + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=2, + axis=-1, + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=1, + axis=[0, 1], + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=np.inf, + axis=[0, 1], + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=-np.inf, + axis=[0, 1], + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) def test_dygraph(self): run_graph(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32") @@ -315,6 +431,7 @@ class API_NormTest(unittest.TestCase): paddle.norm(data, p=p, out=out) self.assertRaises(TypeError, err_dtype, "fro", [2, 2], "int64") + self.assertRaises(ValueError, paddle.norm, "inf", [2], "int64") out = fluid.data(name="out", shape=[1], dtype="int64") self.assertRaises(TypeError, err_dtype, "fro", [2, 2], "float64", out) @@ -325,6 +442,7 @@ class API_NormTest(unittest.TestCase): self.assertRaises(ValueError, paddle.norm, data, p="unsupport norm") self.assertRaises(ValueError, paddle.norm, data, p=[1]) self.assertRaises(ValueError, paddle.norm, data, p=[1], axis=-1) + self.assertRaises(ValueError, paddle.norm, 0, [1, 0], "float64") data = fluid.data(name="data_3d", shape=[2, 2, 2], dtype="float64") self.assertRaises( ValueError, paddle.norm, data, p='unspport', axis=[-3, -2, -1]) diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py index 4b2914c223a08c52444e085f0ef9e41518694593..c1992d0d539a5c6499b9b8d022b88997729ef782 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py @@ -261,7 +261,13 @@ class TestMultiOptimizersMultiCardsError(unittest.TestCase): exe.run(startup_program) np.random.seed(SEED) + + # NOTE(liym27): + # This test needs to run in multi cards to test NotImplementedError. + # Here, move this test from RUN_TYPE=DIST in tests/unittests/CMakeList.txt, + # to use multi cards ** only on CPU ** not GPU to reduce CI time. 
os.environ['CPU_NUM'] = str(2) + pe_exe = fluid.ParallelExecutor( use_cuda=use_cuda, main_program=main_program, diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 6671a2def3cccd2acd76025e73486b06b4bb1471..ea59a7f584a2dd5a06d37ede160ace130fc93580 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -176,7 +176,7 @@ class TestCRFModel(unittest.TestCase): place=fluid.CPUPlace()) data = train_data() - for i in range(10): + for i in range(4): cur_batch = next(data) print(exe.run(train_cp, feed=feeder.feed(cur_batch), diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py index a77f1cdd57d7bade92e2a4f914dc3d91624d4845..505a1c738384194032329f66c33fa27e3ed3045c 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py @@ -165,7 +165,6 @@ class TestPool3d_API(unittest.TestCase): self.assertTrue(np.allclose(result.numpy(), result_np)) def check_max_dygraph_ndhwc_results(self, place): - print("run ndchw max pool3d") with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") input = fluid.dygraph.to_variable( @@ -190,7 +189,6 @@ class TestPool3d_API(unittest.TestCase): np.transpose(result.numpy(), [0, 4, 1, 2, 3]), result_np)) def check_max_dygraph_ceilmode_results(self, place): - print("run ceil mode max pool3d") with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") input = fluid.dygraph.to_variable(input_np) diff --git a/python/paddle/fluid/tests/unittests/test_regularizer_api.py b/python/paddle/fluid/tests/unittests/test_regularizer_api.py new file mode 100644 index 0000000000000000000000000000000000000000..76186d2e39feafe772fce6cc7f9099e97d833232 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_regularizer_api.py @@ -0,0 +1,204 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
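Note: the new test_regularizer_api.py below checks that paddle.regularizer.L2Decay(coeff) matches a hand-built penalty. The identity being tested: L2 decay adds coeff/2 * sum(w**2) to the loss per parameter, i.e. coeff * w to each gradient, which is exactly what the manual branch constructs from square/reduce_sum/sums scaled by .5. A numeric sketch of the penalty with coeff = 1.0 (the value the test uses):

    import numpy as np

    w = np.array([0.5, -1.0, 2.0])
    coeff = 1.0
    penalty = 0.5 * coeff * np.sum(np.square(w))  # 0.5 * 5.25 = 2.625
    grad_term = coeff * w                         # what the decay adds to dL/dw

Both programs should therefore drive identical parameter updates, which is what the rtol=5e-5 comparison in test_l2 asserts.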
+ +from __future__ import print_function + +import unittest +from functools import partial +import contextlib +import numpy as np +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.optimizer as optimizer +import paddle.regularizer as regularizer +from paddle.fluid.backward import append_backward + + +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=8, + hid_dim=8, + hid_dim2=6, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestRegularizer(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=1)() + self.train_data = [next(reader) for _ in range(1)] + + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + @contextlib.contextmanager + def scope_prog_guard(self, main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + def run_program(self, place, feed_list): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + main_prog = fluid.default_main_program() + param_list = [var.name for var in main_prog.block(0).all_parameters()] + + param_sum = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=param_list) + p_sum = 0 + for v in out: + p_sum += np.sum(np.abs(v)) + param_sum.append(p_sum) + return param_sum + + def check_l2decay_regularizer(self, place, model): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + with self.scope_prog_guard( + main_prog=main_prog, startup_prog=startup_prog): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = model(data, label, len(self.word_dict)) + + optimizer = fluid.optimizer.Adagrad( + learning_rate=0.1, + regularization=paddle.regularizer.L2Decay(1.0)) + optimizer.minimize(avg_cost) + param_sum = self.run_program(place, [data, label]) + return param_sum + + def check_l2decay(self, place, model): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + + with self.scope_prog_guard( + main_prog=main_prog, startup_prog=startup_prog): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost_l2 = model(data, label, len(self.word_dict)) + + param_list = 
fluid.default_main_program().block(0).all_parameters() + para_sum = [] + for para in param_list: + para_mul = fluid.layers.square(x=para) + para_sum.append(fluid.layers.reduce_sum(input=para_mul)) + avg_cost_l2 += fluid.layers.sums(para_sum) * .5 + + optimizer = fluid.optimizer.Adagrad(learning_rate=0.1) + optimizer.minimize(avg_cost_l2) + param_sum = self.run_program(place, [data, label]) + return param_sum + + def test_l2(self): + for place in self.get_places(): + dense_sparse_p_sum = [] + for sparse in [True, False]: + model = partial(bow_net, is_sparse=sparse) + framework_l2 = self.check_l2decay_regularizer(place, model) + l2 = self.check_l2decay(place, model) + assert len(l2) == len(framework_l2) + for i in range(len(l2)): + assert np.isclose(a=framework_l2[i], b=l2[i], rtol=5e-5) + dense_sparse_p_sum.append(framework_l2) + + assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1]) + for i in range(len(dense_sparse_p_sum[0])): + assert np.isclose( + a=dense_sparse_p_sum[0][i], + b=dense_sparse_p_sum[1][i], + rtol=5e-5) + + def test_repeated_regularization(self): + l1 = paddle.regularizer.L1Decay(0.1) + l2 = paddle.regularizer.L2Decay(0.01) + fc_param_attr = fluid.ParamAttr(regularizer=l1) + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.layers.uniform_random([2, 2, 3]) + out = fluid.layers.fc(x, 5, param_attr=fc_param_attr) + loss = fluid.layers.reduce_sum(out) + sgd = fluid.optimizer.SGD(learning_rate=0.1, regularization=l2) + sgd.minimize(loss) + with fluid.dygraph.guard(): + input = fluid.dygraph.to_variable( + np.random.randn(3, 2).astype('float32')) + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) + + linear1 = fluid.dygraph.Linear( + 2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) + linear2 = fluid.dygraph.Linear( + 2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) + + loss1 = linear1(input) + loss1.backward() + # set l2 regularizer in optimizer, but l1 in fluid.ParamAttr + + fluid.optimizer.SGD(parameter_list=linear1.parameters(), + learning_rate=1e-2, + regularization=l2).minimize(loss1) + # only set l1 in fluid.ParamAttr + loss2 = linear2(input) + loss2.backward() + fluid.optimizer.SGD(parameter_list=linear2.parameters(), + learning_rate=1e-2).minimize(loss2) + # they should both be applied by l1, and keep the same + self.assertTrue( + np.allclose(linear1.weight.numpy(), linear2.weight.numpy()), + "weight should use the regularization in fluid.ParamAttr!") + self.assertTrue( + np.allclose(linear1.bias.numpy(), linear2.bias.numpy()), + "bias should use the regularization in fluid.ParamAttr!") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index 7e2ef36c1a7fda5c31049ec9c752c5226bfb89dc..6ca194b2694b6c7537ceb94e11eb1a1a0aeb8d8d 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -248,8 +248,7 @@ class PolicyGradient(object): func=reward_func, x=[action, length], out=reward) neg_log_prob = layers.cross_entropy(act_prob, action) cost = neg_log_prob * reward - cost = (layers.reduce_sum(cost) / - layers.cast(layers.reduce_sum(length), "float32") + cost = (layers.reduce_sum(cost) / layers.reduce_sum(length) ) if length is not None else layers.reduce_mean(cost) optimizer = fluid.optimizer.Adam(self.lr) optimizer.minimize(cost) diff --git 
a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py index b01863880866e247f2aee4b94ae3121c9d891f92..fb8a090b80700d9b884a72f7f430723754523a13 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py @@ -181,16 +181,11 @@ class TestROIAlignInLodOp(TestROIAlignOp): self.calc_roi_align() seq_len = self.rois_lod[0] - cur_len = 0 - lod = [cur_len] - for l in seq_len: - cur_len += l - lod.append(cur_len) self.inputs = { 'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod), - 'RoisLod': np.asarray(lod).astype('int64') + 'RoisNum': np.asarray(seq_len).astype('int32') } self.attrs = { diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py index 1200b0e3470f650dce4365ee46458c8184281292..c6622cf8d9ce8ae655a6b2e5c130ed9990fd2a5b 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py @@ -174,16 +174,11 @@ class TestROIPoolInLodOp(TestROIPoolOp): self.calc_roi_pool() seq_len = self.rois_lod[0] - cur_len = 0 - lod = [cur_len] - for l in seq_len: - cur_len += l - lod.append(cur_len) self.inputs = { 'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod), - 'RoisLod': np.asarray(lod).astype('int64') + 'RoisNum': np.asarray(seq_len).astype('int32') } self.attrs = { diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index 09cd40d9cc59914c82cc343bb78b72fbc2b29e59..1c11e831b0ad31a3c450c70e7f7c258455409d05 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -227,14 +227,15 @@ class TestConvertSyncBatchNorm(unittest.TestCase): return with program_guard(Program(), Program()): + compare_model = paddle.nn.Sequential( + paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5)) model = paddle.nn.Sequential( paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5)) - sync_model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) - for idx, sublayer in enumerate(model.sublayers()): + model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + for idx, sublayer in enumerate(compare_model.sublayers()): if isinstance(sublayer, paddle.nn.BatchNorm2d): self.assertEqual( - isinstance(sync_model[idx], paddle.nn.SyncBatchNorm), - True) + isinstance(model[idx], paddle.nn.SyncBatchNorm), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_translated_layer.py b/python/paddle/fluid/tests/unittests/test_translated_layer.py index 20c51b9afbafac9ba1fa032aea446383bc2b9796..e5dc279750d3d9605aeba1d27dbb84a35cf31921 100644 --- a/python/paddle/fluid/tests/unittests/test_translated_layer.py +++ b/python/paddle/fluid/tests/unittests/test_translated_layer.py @@ -49,7 +49,10 @@ class LinearNet(nn.Layer): super(LinearNet, self).__init__() self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - @paddle.jit.to_static + @paddle.jit.to_static(input_spec=[ + paddle.static.InputSpec( + shape=[None, IMAGE_SIZE], dtype='float32', name='x') + ]) def forward(self, x): return self._linear(x) @@ -152,6 +155,34 @@ class TestTranslatedLayer(unittest.TestCase): with self.assertRaises(ValueError): program = translated_layer.program('not_exists') + def test_get_input_spec(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + expect_spec = [ + 
paddle.static.InputSpec( + shape=[None, IMAGE_SIZE], dtype='float32', name='x') + ] + actual_spec = translated_layer._input_spec() + + for spec_x, spec_y in zip(expect_spec, actual_spec): + self.assertEqual(spec_x, spec_y) + + def test_get_output_spec(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + expect_spec = [ + paddle.static.InputSpec( + shape=[None, CLASS_NUM], + dtype='float32', + name='translated_layer/scale_0.tmp_1') + ] + actual_spec = translated_layer._output_spec() + + for spec_x, spec_y in zip(expect_spec, actual_spec): + self.assertEqual(spec_x, spec_y) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py index aed265b21b5781d88da0380b04872061e893d736..2cd2599f2ea2f4fb26b2d2730ca45384a3b664a7 100644 --- a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py +++ b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py @@ -142,6 +142,18 @@ class TestTrilTriuOpAPI(unittest.TestCase): self.assertTrue(np.allclose(tril_out, np.tril(data))) self.assertTrue(np.allclose(triu_out, np.triu(data))) + def test_fluid_api(self): + data = np.random.random([1, 9, 9, 4]).astype('float32') + x = fluid.data(shape=[1, 9, -1, 4], dtype='float32', name='x') + triu_out = fluid.layers.triu(x) + + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + triu_out = exe.run(fluid.default_main_program(), + feed={"x": data}, + fetch_list=[triu_out]) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py index 49924b44441aa9ae323f0d7921d71bf58b8c2cf2..245c2623b869af30acfb5d0379c7597813645031 100755 --- a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py @@ -26,6 +26,9 @@ def trilinear_interp_np(input, out_d, out_h, out_w, + scale_d=0, + scale_h=0, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -49,17 +52,26 @@ def trilinear_interp_np(input, if (align_corners): ratio_d = (in_d - 1.0) / (out_d - 1.0) else: - ratio_d = 1.0 * in_d / out_d + if scale_d > 0: + ratio_d = 1.0 / scale_d + else: + ratio_d = 1.0 * in_d / out_d if out_h > 1: if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if out_w > 1: if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_d, out_h, out_w)) @@ -133,6 +145,9 @@ class TestTrilinearInterpOp(OpTest): self.op_type = "trilinear_interp_v2" input_np = np.random.random(self.input_shape).astype("float32") + scale_w = 0 + scale_h = 0 + scale_d = 0 if self.data_layout == "NCDHW": in_d = self.input_shape[2] in_h = self.input_shape[3] @@ -159,9 +174,10 @@ class TestTrilinearInterpOp(OpTest): out_h = self.out_h out_w = self.out_w - output_np = trilinear_interp_np( - input_np, out_d, out_h, out_w, self.out_size, self.actual_shape, - self.align_corners, self.align_mode, self.data_layout) + output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, scale_d, + scale_h, scale_w, self.out_size, + self.actual_shape, self.align_corners, + 
self.align_mode, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -359,7 +375,7 @@ class TestTrilinearInterpOpUint8(OpTest): out_h = self.out_h out_w = self.out_w - output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, + output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, 0, 0, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode) self.inputs = {'X': input_np} @@ -482,7 +498,7 @@ class TestTrilinearInterpZero(TestTrilinearInterpOp): self.out_d = 60 self.out_h = 40 self.out_w = 25 - self.scale = 0.2 + self.scale = 0.0 self.align_corners = False self.align_mode = 0 @@ -541,7 +557,7 @@ class TestTrilinearInterpOp_attr_tensor(OpTest): if isinstance(self.scale, list) and len(self.scale) == 1: self.scale = [self.scale[0], self.scale[0], self.scale[0]] self.attrs['scale'] = self.scale - output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, + output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, 0, 0, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode) self.outputs = {'Out': output_np} diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py index 9382d53e7fec6ba9e1217f99ba5006b3dfe5c150..1975e4306026ee459aa585c47afa74fce6a6aeed 100644 --- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py @@ -134,29 +134,61 @@ class API_TestUnsqueeze3(unittest.TestCase): result1, = exe.run(feed={"data1": input, "data2": input2}, fetch_list=[result_squeeze]) - self.assertTrue(np.allclose(input1, result1)) + self.assertTrue(np.array_equal(input1, result1)) + self.assertEqual(input1.shape, result1.shape) class API_TestDyUnsqueeze(unittest.TestCase): def test_out(self): with fluid.dygraph.guard(): input_1 = np.random.random([5, 1, 10]).astype("int32") - input1 = np.squeeze(input_1, axis=1) + input1 = np.expand_dims(input_1, axis=1) input = fluid.dygraph.to_variable(input_1) output = paddle.unsqueeze(input, axis=[1]) out_np = output.numpy() - self.assertTrue(np.allclose(input1, out_np)) + self.assertTrue(np.array_equal(input1, out_np)) + self.assertEqual(input1.shape, out_np.shape) class API_TestDyUnsqueeze2(unittest.TestCase): def test_out(self): with fluid.dygraph.guard(): - input_1 = np.random.random([5, 1, 10]).astype("int32") - input1 = np.squeeze(input_1, axis=1) - input = fluid.dygraph.to_variable(input_1) + input1 = np.random.random([5, 10]).astype("int32") + out1 = np.expand_dims(input1, axis=1) + input = fluid.dygraph.to_variable(input1) output = paddle.unsqueeze(input, axis=1) out_np = output.numpy() - self.assertTrue(np.allclose(input1, out_np)) + self.assertTrue(np.array_equal(out1, out_np)) + self.assertEqual(out1.shape, out_np.shape) + + +class API_TestDyUnsqueezeAxisTensor(unittest.TestCase): + def test_out(self): + with fluid.dygraph.guard(): + input1 = np.random.random([5, 10]).astype("int32") + out1 = np.expand_dims(input1, axis=1) + out1 = np.expand_dims(out1, axis=2) + input = fluid.dygraph.to_variable(input1) + output = paddle.unsqueeze(input, axis=paddle.to_tensor([1, 2])) + out_np = output.numpy() + self.assertTrue(np.array_equal(out1, out_np)) + self.assertEqual(out1.shape, out_np.shape) + + +class API_TestDyUnsqueezeAxisTensorList(unittest.TestCase): + def test_out(self): + with fluid.dygraph.guard(): + input1 = np.random.random([5, 10]).astype("int32") + # Actually, expand_dims supports tuple since 
version 1.18.0 + out1 = np.expand_dims(input1, axis=1) + out1 = np.expand_dims(out1, axis=2) + input = fluid.dygraph.to_variable(input1) + output = paddle.unsqueeze( + fluid.dygraph.to_variable(input1), + axis=[paddle.to_tensor([1]), paddle.to_tensor([2])]) + out_np = output.numpy() + self.assertTrue(np.array_equal(out1, out_np)) + self.assertEqual(out1.shape, out_np.shape) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py new file mode 100644 index 0000000000000000000000000000000000000000..fb93334415c3046362090a143f6c15069793709a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py @@ -0,0 +1,250 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid as fluid +import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn + + +class TestUpdateLossScalingOp(OpTest): + def setUp(self): + self.op_type = "update_loss_scaling" + self.init() + found_inf = np.array([False], dtype=np.bool) + x = np.random.random((1024, 1024)).astype(self.dtype) + + self.inputs = { + 'X': [('x0', x)], + 'FoundInfinite': found_inf, + 'PrevLossScaling': self.prev_loss_scaling, + 'InGoodSteps': self.num_good_steps, + 'InBadSteps': self.num_bad_steps + } + + self.outputs = { + 'Out': [('out0', np.zeros_like(x))], + 'LossScaling': self.prev_loss_scaling * self.incr_ratio, + 'OutGoodSteps': self.zero_steps, + 'OutBadSteps': self.zero_steps + } + + def init(self): + self.incr_ratio = 2.0 + self.decr_ratio = 0.8 + self.dtype = np.float32 + self.prev_loss_scaling = np.array([2048]).astype(self.dtype) + self.num_good_steps = np.array([999], dtype=np.int32) + self.num_bad_steps = np.array([1], dtype=np.int32) + self.zero_steps = np.array([0], dtype=np.int32) + self.attrs = { + 'incr_every_n_steps': 1000, + 'decr_every_n_nan_or_inf': 2, + 'incr_ratio': self.incr_ratio, + 'decr_ratio': self.decr_ratio, + } + + def test_check_output(self): + self.check_output(no_check_set=['Out']) + + +class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): + def setUp(self): + self.op_type = "update_loss_scaling" + self.init() + found_inf = np.array([True], dtype=np.bool) + x = np.random.random((1024, 1024)).astype(self.dtype) + i = np.random.randint(0, 1024, 1) + j = np.random.randint(0, 1024, 1) + x[i[0]][j[0]] = np.inf + + self.inputs = { + 'X': [('x0', x)], + 'FoundInfinite': found_inf, + 'PrevLossScaling': self.prev_loss_scaling, + 'InGoodSteps': self.num_good_steps, + 'InBadSteps': self.num_bad_steps + } + + self.outputs = { + 'Out': [('out0', np.zeros_like(x))], + 'LossScaling': self.prev_loss_scaling * self.decr_ratio, + 'OutGoodSteps': self.zero_steps, + 'OutBadSteps': self.zero_steps + } + + def test_check_output(self): + self.check_output() + + +class TestUpdateLossScalingLayer(unittest.TestCase): + def loss_scaling_check(self, 
use_cuda=True, scope=fluid.Scope()): + a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') + b = fluid.data(name="b", shape=[512, 128], dtype='float32') + x = [a, b] + found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') + prev_loss_scaling = fluid.data( + name="prev_loss_scaling", shape=[1], dtype='float32') + num_good_steps = fluid.data( + name="num_good_steps", shape=[1], dtype='int32') + num_bad_steps = fluid.data( + name="num_bad_steps", shape=[1], dtype='int32') + + a_v = np.random.random([1024, 1024]).astype('float32') + b_v = np.random.random([512, 128]).astype('float32') + found_inf_v = np.array([False]).astype('bool') + prev_loss_scaling_v = np.array([2048]).astype('float32') + num_good_steps_v = np.array([999], dtype=np.int32) + num_bad_steps_v = np.array([1], dtype=np.int32) + + incr_every_n_steps = 1000 + decr_every_n_nan_or_inf = 2 + incr_ratio = 2 + decr_ratio = 0.8 + + result = amp_nn.update_loss_scaling( + x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name="update_loss_scaling") + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + with fluid.scope_guard(scope): + exe.run(fluid.default_startup_program()) + result_v = exe.run(feed={ + 'a': a_v, + 'b': b_v, + 'found_inf': found_inf_v, + 'prev_loss_scaling': prev_loss_scaling_v, + 'num_good_steps': num_good_steps_v, + 'num_bad_steps': num_bad_steps_v + }, + fetch_list=[ + result, x, found_inf, prev_loss_scaling, + num_good_steps, num_bad_steps + ]) + assert np.array_equal(result_v[0], a_v) + assert np.array_equal(result_v[1], b_v) + assert np.array_equal(result_v[0], result_v[2]) + assert np.array_equal(result_v[1], result_v[3]) + assert np.array_equal(result_v[4], found_inf_v) + assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio) + assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) + assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) + + def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()): + a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') + b = fluid.data(name="b", shape=[512, 128], dtype='float32') + x = [a, b] + found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') + prev_loss_scaling = fluid.data( + name="prev_loss_scaling", shape=[1], dtype='float32') + num_good_steps = fluid.data( + name="num_good_steps", shape=[1], dtype='int32') + num_bad_steps = fluid.data( + name="num_bad_steps", shape=[1], dtype='int32') + + a_v = np.random.random([1024, 1024]).astype('float32') + b_v = np.random.random([512, 128]).astype('float32') + i = np.random.randint(0, 1024, 1) + j = np.random.randint(0, 1024, 1) + a_v[i[0]][j[0]] = np.inf + found_inf_v = np.array([True]).astype('bool') + prev_loss_scaling_v = np.array([2048]).astype('float32') + num_good_steps_v = np.array([999], dtype=np.int32) + num_bad_steps_v = np.array([1], dtype=np.int32) + + incr_every_n_steps = 1000 + decr_every_n_nan_or_inf = 2 + incr_ratio = 2 + decr_ratio = 0.8 + + result = amp_nn.update_loss_scaling( + x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name="update_loss_scaling") + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + with fluid.scope_guard(scope): + exe.run(fluid.default_startup_program()) + result_v = exe.run(feed={ + 'a': a_v, + 'b': b_v, + 
'found_inf': found_inf_v, + 'prev_loss_scaling': prev_loss_scaling_v, + 'num_good_steps': num_good_steps_v, + 'num_bad_steps': num_bad_steps_v + }, + fetch_list=[ + result, x, found_inf, prev_loss_scaling, + num_good_steps, num_bad_steps + ]) + assert np.array_equal(result_v[0], np.zeros_like(a_v)) + assert np.array_equal(result_v[1], np.zeros_like(b_v)) + assert np.array_equal(result_v[2], np.zeros_like(a_v)) + assert np.array_equal(result_v[3], np.zeros_like(b_v)) + assert np.array_equal(result_v[4], found_inf_v) + assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio) + assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) + assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) + + def test_loss_scaling_cpu(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check(use_cuda=False) + + def test_loss_scaling_cpu_inf(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check_inf(use_cuda=False) + + def test_loss_scaling_gpu(self): + if fluid.core.is_compiled_with_cuda(): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check(use_cuda=True) + + def test_loss_scaling_gpu_inf(self): + if fluid.core.is_compiled_with_cuda(): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check_inf(use_cuda=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py index 227e6cc28fb4a6d05d73cbf2c3c92bda623b7d58..e19641e710dda6cd2614a75a3ca4b2f7ec1c0b58 100644 --- a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py @@ -26,4 +26,5 @@ NEED_TO_FIX_OP_LIST = [ 'squared_l2_distance', 'tree_conv', 'cvm', + 'cudnn_lstm', ] diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py index 0de0eeb464ad700abb2144e49a822582b8653589..afd3414943e9c94799aba5e5e747182623b0a095 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py @@ -25,6 +25,7 @@ no_check_set_white_list = [ 'unsqueeze2', 'cross_entropy2', 'seed', - 'amp_check_finite_and_scale', + 'check_finite_and_unscale', + 'update_loss_scaling', 'cudnn_lstm', ] diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index 716be1b539809ea3f90885b512f51ac45d85cd37..d388ba62f2a244f84497810739e5fd6b50f669d2 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
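For reference, the dynamic loss-scaling bookkeeping that the update_loss_scaling tests above exercise can be stated in a few lines. A minimal sketch (the counter and ratio names mirror the test attributes; the real op additionally zeroes its gradient outputs when an inf/nan is found, as the inf tests assert):

.. code-block:: python

    def update_loss_scaling_rule(found_inf, scaling, good, bad,
                                 incr_every_n_steps, decr_every_n_nan_or_inf,
                                 incr_ratio, decr_ratio):
        # Scalar sketch of the op's counter updates; the gradients themselves
        # are passed through unchanged on clean steps and zeroed otherwise.
        if found_inf:
            good, bad = 0, bad + 1
            if bad == decr_every_n_nan_or_inf:
                scaling, bad = scaling * decr_ratio, 0
        else:
            bad, good = 0, good + 1
            if good == incr_every_n_steps:
                scaling, good = scaling * incr_ratio, 0
        return scaling, good, bad

    # Mirrors TestUpdateLossScalingOp: the 1000th clean step doubles the scale
    # and resets both counters.
    print(update_loss_scaling_rule(False, 2048.0, 999, 1, 1000, 2, 2.0, 0.8))
    # Mirrors TestUpdateLossScalingOpBad: a second bad step decays the scale
    # by decr_ratio and resets both counters.
    print(update_loss_scaling_rule(True, 2048.0, 999, 1, 1000, 2, 2.0, 0.8))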
+import warnings import numpy as np +import numbers import paddle import paddle.nn as nn @@ -107,6 +109,11 @@ def summary(net, input_size, batch_size=None, dtypes=None): if batch_size is None: batch_size = -1 + if not paddle.in_dynamic_mode(): + warnings.warn( + "Your model was created in static mode, so the summary information may not be correct!" + ) + result, params_info = summary_string(net, _input_size, batch_size, dtypes) print(result) @@ -121,16 +128,16 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None): depth = len(list(model.sublayers())) - def register_hook(module): - def hook(module, input, output): - class_name = str(module.__class__).split(".")[-1].split("'")[0] + def register_hook(layer): + def hook(layer, input, output): + class_name = str(layer.__class__).split(".")[-1].split("'")[0] try: - module_idx = int(module._full_name.split('_')[-1]) + layer_idx = int(layer._full_name.split('_')[-1]) except: - module_idx = len(summary) + layer_idx = len(summary) - m_key = "%s-%i" % (class_name, module_idx + 1) + m_key = "%s-%i" % (class_name, layer_idx + 1) summary[m_key] = OrderedDict() summary[m_key]["input_shape"] = list(input[0].shape) summary[m_key]["input_shape"][0] = batch_size @@ -142,23 +149,50 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None): summary[m_key]["output_shape"][0] = batch_size params = 0 - if hasattr(module, "weight") and hasattr(module.weight, "shape"): - params += np.prod(module.weight.shape) - summary[m_key]["trainable"] = module.weight.trainable or ( - not module.weight.stop_gradient) - if hasattr(module, "bias") and hasattr(module.bias, "shape"): - params += np.prod(module.bias.shape) + + if paddle.in_dynamic_mode(): + layer_state_dict = layer._parameters + else: + layer_state_dict = layer.state_dict() + + for k, v in layer_state_dict.items(): + params += np.prod(v.shape) + + try: + if (getattr(getattr(layer, k), 'trainable')) and ( + not getattr(getattr(layer, k), 'stop_gradient')): + summary[m_key]["trainable"] = True + else: + summary[m_key]["trainable"] = False + except: + summary[m_key]["trainable"] = True + summary[m_key]["nb_params"] = params - if (not isinstance(module, nn.Sequential) and - not isinstance(module, nn.LayerList) and - (not (module == model) or depth < 1)): + if (not isinstance(layer, nn.Sequential) and + not isinstance(layer, nn.LayerList) and + (not (layer == model) or depth < 1)): + + hooks.append(layer.register_forward_post_hook(hook)) + + def _check_input_size(input_sizes): + for input_size in input_sizes: + for item in input_size: + if not isinstance(item, numbers.Number): + raise TypeError( + "Expected item in input size to be a number, but got {}". + format(type(item))) - hooks.append(module.register_forward_post_hook(hook)) + if item <= 0: + raise ValueError( + "Expected item in input size to be greater than zero, but got {}".
+ format(item)) if isinstance(input_size, tuple): input_size = [input_size] + _check_input_size(input_size) + x = [ paddle.rand( [2] + list(in_size), dtype=dtype) @@ -197,7 +231,12 @@ def summary_string(model, input_size, batch_size=-1, dtypes=None): "{0:,}".format(summary[layer]["nb_params"]), ) total_params += summary[layer]["nb_params"] - total_output += np.prod(summary[layer]["output_shape"]) + try: + total_output += np.prod(summary[layer]["output_shape"]) + except: + for output_shape in summary[layer]["output_shape"]: + total_output += np.prod(output_shape) + if "trainable" in summary[layer]: if summary[layer]["trainable"] == True: trainable_params += summary[layer]["nb_params"] diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 66caba540f2fed8c035d0f1af14f9e40a329bca5..79583f344f0c1f642586c4a8ecc08f2aa4e24008 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -89,7 +89,7 @@ from .layer.common import CosineSimilarity #DEFINE_ALIAS from .layer.common import Embedding #DEFINE_ALIAS from .layer.common import Linear #DEFINE_ALIAS from .layer.common import Flatten #DEFINE_ALIAS -from .layer.common import UpSample #DEFINE_ALIAS +from .layer.common import Upsample #DEFINE_ALIAS from .layer.common import UpsamplingNearest2d #DEFINE_ALIAS from .layer.common import UpsamplingBilinear2d #DEFINE_ALIAS from .layer.common import Bilinear #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 325eaa64d5ca4bd3d65bf266ff0a42226a3199e6..163c249ab37457d7d4566553c71e3231f384a8b1 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -39,7 +39,7 @@ from .activation import hard_sigmoid #DEFINE_ALIAS from .activation import hard_swish #DEFINE_ALIAS from .activation import hsigmoid #DEFINE_ALIAS from .activation import leaky_relu #DEFINE_ALIAS -from .activation import logsigmoid #DEFINE_ALIAS +from .activation import log_sigmoid #DEFINE_ALIAS from .activation import maxout #DEFINE_ALIAS from .activation import prelu #DEFINE_ALIAS from .activation import relu #DEFINE_ALIAS @@ -72,6 +72,7 @@ from .common import unfold #DEFINE_ALIAS # from .common import bilinear_tensor_product #DEFINE_ALIAS from .common import assign #DEFINE_ALIAS from .common import interpolate #DEFINE_ALIAS +from .common import upsample #DEFINE_ALIAS from .common import bilinear #DEFINE_ALIAS from .conv import conv1d #DEFINE_ALIAS from .conv import conv_transpose1d #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index ffedb027330bda94db86dc0943a5c4a7281f254f..f7bbe0c94e03dc48ebfb21a62aeded9f446afc63 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -35,7 +35,7 @@ __all__ = [ 'hard_swish', 'hsigmoid', 'leaky_relu', - 'logsigmoid', + 'log_sigmoid', 'maxout', 'prelu', 'relu', @@ -552,13 +552,13 @@ def relu(x, name=None): return out -def logsigmoid(x, name=None): +def log_sigmoid(x, name=None): """ - logsigmoid activation. + log_sigmoid activation. .. math:: - logsigmoid(x) = log \\frac{1}{1 + e^{-x}} + log\\_sigmoid(x) = log \\frac{1}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, float64. 
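Looking back at the model_summary changes above: the hook now walks each sublayer's parameters (via `layer._parameters` or `state_dict()`) instead of only `weight`/`bias` attributes, so custom layers are counted too. A small sketch of driving it, assuming the helper is exported as `paddle.summary` on this branch:

.. code-block:: python

    import paddle
    import paddle.nn as nn

    paddle.disable_static()
    model = nn.Sequential(nn.Linear(8, 4), nn.ReLU(), nn.Linear(4, 2))
    # summary_string registers forward post-hooks on every sublayer, runs a
    # dummy batch of shape [2, 8], then tabulates shapes and parameter counts.
    params_info = paddle.summary(model, input_size=(8,))
    print(params_info)  # e.g. {'total_params': 46, 'trainable_params': 46}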
@@ -573,20 +573,19 @@ def logsigmoid(x, name=None): import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0])) - out = F.logsigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = F.log_sigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] """ if in_dygraph_mode(): return core.ops.logsigmoid(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'logsigmoid') - helper = LayerHelper("logsigmoid", **locals()) + 'log_sigmoid') + helper = LayerHelper("log_sigmoid", **locals()) out = helper.create_variable_for_type_inference(x.dtype) helper.append_op(type='logsigmoid', inputs={'X': x}, outputs={'Out': out}) return out diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index ad84a32186e8baeabbe8eea7d14e2b7391332944..9f7fb0185133f580deba64634b62d82955670641 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -80,6 +80,8 @@ def interpolate(x, The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), + Where in_w is width of the input tensor, in_h is the height of the input tensor, + in_d is the depth of the intput tensor. and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: @@ -88,6 +90,7 @@ def interpolate(x, 'trilinear' : Trilinear interpolation 'nearest' : Nearest neighbor interpolation 'bicubic' : Bicubic interpolation + 'area': Area interpolation Linear interpolation is the method of using a line connecting two known quantities to determine the value of an unknown quantity between the two known quantities. @@ -114,6 +117,12 @@ def interpolate(x, smoother than corresponding surfaces obtained by bilinear interpolation or nearest-neighbor interpolation. + Area interpolation is to perform area interpolation + in both the 3rd dimension(in height direction) , the 4th dimension(in width + direction) and the 5th dimension(in depth direction) on input tensor. Set to + area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or + `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. + Example: .. code-block:: text @@ -207,11 +216,11 @@ def interpolate(x, when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.Has to match input size if it is a list. + scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`.Has to match input size if it is either a list or a tuple or a Tensor. Default: None. - mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear', + mode (str): The resample method. It supports 'linear', 'area', 'nearest', 'bilinear', 'bicubic' and 'trilinear' currently. 
Default: 'nearest' align_corners(bool) : An optional bool, If True, the centers of the 4 corner pixels of the input and output tensors are aligned, preserving the values at the @@ -235,7 +244,7 @@ def interpolate(x, Raises: TypeError: size should be a list or tuple or Tensor. ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', - 'trilinear', 'bicubic', or 'nearest' currently. + 'trilinear', 'bicubic', 'area' or 'nearest' currently. ValueError: 'linear' only support 3-D tensor. ValueError: 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor. ValueError: 'trilinear' only support 5-D tensor. @@ -283,10 +292,11 @@ def interpolate(x, 'TRILINEAR', 'NEAREST', 'BICUBIC', + 'AREA', ] if resample not in resample_methods: raise ValueError( - "The 'resample' of image_resize can only be 'linaer', 'bilinear', 'trilinear', " + "The 'resample' of image_resize can only be 'area', 'linear', 'bilinear', 'trilinear', " " 'bicubic' or 'nearest' currently.") if resample in ['LINEAR'] and len(x.shape) != 3: @@ -310,8 +320,17 @@ def interpolate(x, raise ValueError( "align_corners option can only be set with the interpolating modes: linear | bilinear | bicubic | trilinear" ) + + if resample == 'AREA' and len(x.shape) == 3: + return paddle.nn.functional.adaptive_avg_pool1d(x, size) + + if resample == 'AREA' and len(x.shape) == 4: + return paddle.nn.functional.adaptive_avg_pool2d(x, size) + if resample == 'AREA' and len(x.shape) == 5: + return paddle.nn.functional.adaptive_avg_pool3d(x, size) + helper = LayerHelper('{}_interp_v2'.format(resample_type), **locals()) - dtype = helper.input_dtype() + dtype = helper.input_dtype(input_param_name='x') if len(x.shape) == 3 and data_format not in ['NCW', 'NWC']: raise ValueError( "Got wrong value for param `data_format`: " + data_format + @@ -349,14 +368,15 @@ def interpolate(x, out_shape = size scale = scale_factor + if out_shape is not None and scale is not None: + raise ValueError("Only one of size or scale_factor should be defined.") if out_shape is not None: if isinstance(out_shape, Variable): out_shape.stop_gradient = True inputs['OutSize'] = out_shape else: if not (_is_list_or_turple_(out_shape)): - raise TypeError( - "out_shape should be a list or tuple or Variable.") + raise TypeError("size should be a list or tuple or Variable.") # Validate the shape contain_var = False for dim_idx, dim_size in enumerate(out_shape): @@ -388,7 +408,7 @@ def interpolate(x, if len(x.shape) == 3: if len(out_shape) != 1: raise ValueError( - "out_shape length should be 2 for input 3-D tensor") + "size length should be 1 for input 3-D tensor") if contain_var: attrs['out_w'] = size_list[0] else: @@ -396,7 +416,7 @@ def interpolate(x, attrs['out_w'] = out_shape[0] if len(x.shape) == 4: if len(out_shape) != 2: - raise ValueError("out_shape length should be 2 for " + raise ValueError("size length should be 2 for " "input 4-D tensor.") if contain_var: attrs['out_h'] = size_list[0] @@ -407,7 +427,7 @@ def interpolate(x, attrs['out_w'] = out_shape[1] if len(x.shape) == 5: if len(out_shape) != 3: - raise ValueError("out_shape length should be 3 for " + raise ValueError("size length should be 3 for " "input 5-D tensor.") if contain_var: attrs['out_d'] = size_list[0] @@ -430,7 +450,7 @@ def interpolate(x, for i in range(len(x.shape) - 2): scale_list.append(scale) attrs['scale'] = list(map(float, scale_list)) - elif isinstance(scale, list): + elif isinstance(scale, list) or isinstance(scale, tuple): if len(scale) != len(x.shape) - 2: raise ValueError("scale_shape length
should be {} for " "input {}-D tensor.".format( @@ -441,7 +461,8 @@ def interpolate(x, attrs['scale'] = list(map(float, scale)) else: raise TypeError( - "Attr(scale)'s type should be float, int, list or Tensor.") + "Attr(scale)'s type should be float, int, list, tuple, or Tensor." + ) if in_dygraph_mode(): attr_list = [] @@ -480,9 +501,12 @@ def upsample(x, name=None): """ This op resizes a batch of images. + The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), + Where in_w is width of the input tensor, in_h is the height of the input tensor, + in_d is the depth of the intput tensor. and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: @@ -507,12 +531,21 @@ def upsample(x, data points on a two-dimensional regular grid. The interpolated surface is smoother than corresponding surfaces obtained by bilinear interpolation or nearest-neighbor interpolation. + Trilinear interpolation is an extension of linear interpolation for interpolating functions of three variables (e.g. D-direction, H-direction and W-direction in this op) on a rectilinear 3D grid. + The linear interpolation is performed on three directions. align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. + + Area interpolation is to perform area interpolation + in both the 3rd dimension(in height direction) , the 4th dimension(in width + direction) and the 5th dimension(in depth direction) on input tensor. Set to + area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or + `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. + Example: .. code-block:: text For scale_factor: @@ -605,9 +638,10 @@ def upsample(x, when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. + scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`.Has to match input size if + it is either a list or a tuple or a Tensor. Default: None. mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear', 'bicubic' and 'trilinear' currently. 
Default: 'nearest' diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 3c1482e69c3c36232ee5d70f2156a8d16c2d212a..5cf4953933242292c6a732513dbee2164811dd35 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -267,8 +267,8 @@ def conv1d(x, dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1] l_type = "conv2d" - if (num_channels == groups and num_filters % num_channels == 0 and - not use_cudnn): + if (num_channels == groups and num_channels != 1 and + num_filters % num_channels == 0 and not use_cudnn): l_type = 'depthwise_conv2d' use_cudnn = False @@ -491,7 +491,8 @@ def conv2d(x, dilation = utils.convert_to_list(dilation, 2, 'dilation') l_type = "conv2d" - if (num_channels == groups and num_filters % num_channels == 0): + if (num_channels == groups and num_channels != 1 and + num_filters % num_channels == 0): l_type = 'depthwise_conv2d' use_cudnn = False @@ -761,7 +762,8 @@ def conv_transpose1d(x, op_type = 'conv2d_transpose' num_filters = weight.shape[1] - if (num_channels == groups and num_filters == 1 and not use_cudnn): + if (num_channels == groups and num_channels != 1 and num_filters == 1 and + not use_cudnn): op_type = 'depthwise_conv2d_transpose' use_cudnn = False @@ -1010,7 +1012,7 @@ def conv_transpose2d(x, op_type = 'conv2d_transpose' num_filters = weight.shape[1] - if (num_channels == groups and num_filters == 1): + if (num_channels == groups and num_channels != 1 and num_filters == 1): op_type = 'depthwise_conv2d_transpose' use_cudnn = False diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index bc48cc21c29e6683602f37fb3eab6c9485fe4977..0794b95c801011da6845eaf82c32a5428e0d5f41 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -113,17 +113,18 @@ def one_hot(x, num_classes, name=None): def embedding(x, weight, padding_idx=None, sparse=False, name=None): """ - The operator is used to lookup embeddings vector of ids provided by :attr:`input` . + The operator is used to lookup embeddings vector of ids provided by :attr:`x` . The shape of output Tensor is generated by appending the last dimension of the input Tensor shape with embedding size. - **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < weight.shape[0]` , + + **Note:** The id in :attr:`x` must satisfy :math:`0 =< id < weight.shape[0]` , otherwise the program will throw an exception and exit. .. code-block:: text Case 1: - input is a Tensor. + x is a Tensor. padding_idx = -1 x.data = [[1, 3], [2, 4], [4, 127]] x.shape = [3, 2] @@ -138,7 +139,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): [0.0, 0.0, ..., 0.0 ]]] # padding data The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 - It will pad all-zero data when ids is 127. + It will pad all-zero data when id is 127. Args: x(Tensor): A Tensor with type int32/int64, which contains the id information. The value of the input id should @@ -151,10 +152,10 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . - In these cases, is_sparse must be False. Default: False. 
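A runnable sketch of the functional lookup described in this docstring, including the negative `padding_idx` conversion; this assumes eager mode and only uses the signature shown in the hunk above:

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    x = paddle.to_tensor(np.array([[0, 2], [3, 3]], dtype=np.int64))
    w = paddle.to_tensor(np.arange(12).reshape(4, 3).astype(np.float32))
    # padding_idx=-1 is normalized to weight.shape[0] - 1 == 3, so every
    # id equal to 3 comes back as an all-zero row; ids must stay inside
    # [0, weight.shape[0]).
    out = F.embedding(x, w, padding_idx=-1, sparse=False)
    print(out.numpy())  # out.shape == [2, 2, 3]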
- padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). + In these cases, sparse must be False. Default: False. + padding_idx(int|long|None): padding_idx needs to be in the interval [-weight.shape[0], weight.shape[0]). If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted - to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup + to :math:`weight.shape[0] + padding\_idx` . It will output all-zero padding data whenever lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. If set None, it makes no effect to output. Default: None. name(str|None): For detailed information, please refer @@ -162,7 +163,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): None by default. Returns: - Tensor: Embedding Tensor mapped by input. The data type is the same as :attr:`weight`. + Tensor: Embedding Tensor mapped by x. The data type is the same as :attr:`weight`. Examples: @@ -209,6 +210,10 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( weight.shape[0] + padding_idx) + if padding_idx >= weight.shape[0] or padding_idx < -weight.shape[0]: + raise ValueError("padding_idx must be within [-{}, {})".format( + weight.shape[0], weight.shape[0])) + helper.append_op( type='lookup_table_v2', inputs={'Ids': x, diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 3d5894064c44cb72259472fc638d46b67c5703fc..da086c0955e849619ccbce17a297ca4615a3f3d0 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -780,10 +780,10 @@ def kl_div(input, label, reduction='mean', name=None): input = np.random.uniform(-10, 10, shape).astype('float32') target = np.random.uniform(-10, 10, shape).astype('float32') - # 'batchmean' reduction, loss shape will be [N] + # 'batchmean' reduction, loss shape will be [1] pred_loss = F.kl_div(paddle.to_tensor(input), paddle.to_tensor(target), reduction='batchmean') - # shape=[5] + # shape=[1] # 'mean' reduction, loss shape will be [1] pred_loss = F.kl_div(paddle.to_tensor(input), @@ -1009,8 +1009,7 @@ def ctc_loss(log_probs, loss_out = fluid.layers.squeeze(loss_out, [-1]) assert reduction in ['mean', 'sum', 'none'] if reduction == 'mean': - loss_out = paddle.mean(loss_out / paddle.cast(label_lengths, - loss_out.dtype)) + loss_out = paddle.mean(loss_out / label_lengths) elif reduction == 'sum': loss_out = paddle.sum(loss_out) return loss_out diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 662205ab69550255406ff5edfda4556b73b98843..042625a3dbd6b07487d6f77442621959f7492af6 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -389,7 +389,7 @@ def avg_pool3d(x, stride=None, padding=0, ceil_mode=False, - count_include_pad=False, + count_include_pad=True, divisor_override=None, data_format="NCDHW", name=None): diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 1dfdac26e990851ac5f192742acd47fb92633d0d..a74a98d5ed45b9f613b0f2f6d5f04544ffae3d2a 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -249,7 +249,7 @@ def grid_sample(x, mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'. Default: 'bilinear'. 
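The flipped `count_include_pad` default in the avg_pool3d hunk above changes averages near the border: zero padding now contributes to the divisor. A small deterministic sketch, assuming eager mode:

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    x = paddle.to_tensor(np.ones((1, 1, 2, 2, 2), dtype='float32'))
    # With padding=1 every 2x2x2 window holds exactly one real element and
    # seven zero pads; the new default count_include_pad=True divides by 8.
    y = F.avg_pool3d(x, kernel_size=2, stride=2, padding=1)
    print(y.numpy().ravel())  # eight values of 0.125 (1.0 if set to False)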
padding_mode(str, optional) The padding method used when source index - is out of input images. It can be 'zeros', 'reflect' and 'border'. + is out of input images. It can be 'zeros', 'reflection' and 'border'. Default: zeros. align_corners(bool, optional): If `align_corners` is true, it will projects -1 and 1 to the centers of the corner pixels. Otherwise, it will @@ -312,7 +312,7 @@ def grid_sample(x, if not isinstance(grid, Variable): raise ValueError("The grid should be a Variable") _modes = ['bilinear', 'nearest'] - _padding_modes = ['zeros', 'reflect', 'border'] + _padding_modes = ['zeros', 'reflection', 'border'] if mode not in _modes: raise ValueError( "The mode of grid sample function should be in {}, but got: {}". diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 7d7a392ebe80c3af8c991dbff746d0f8f216b18b..760af09f1f2f5af066058572f681ec21f9a93180 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -59,7 +59,7 @@ from .common import CosineSimilarity #DEFINE_ALIAS from .common import Embedding #DEFINE_ALIAS from .common import Linear #DEFINE_ALIAS from .common import Flatten #DEFINE_ALIAS -from .common import UpSample #DEFINE_ALIAS +from .common import Upsample #DEFINE_ALIAS from .common import UpsamplingNearest2d #DEFINE_ALIAS from .common import UpsamplingBilinear2d #DEFINE_ALIAS from .common import Dropout #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index c38d6018a2500111280a482aa60d072e65e27742..585d369c607e5b6eb6a2a3bcb28bd8999a2e0dca 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -860,11 +860,10 @@ class LogSigmoid(layers.Layer): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0])) + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) m = paddle.nn.LogSigmoid() out = m(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] """ @@ -874,7 +873,7 @@ class LogSigmoid(layers.Layer): self._name = name def forward(self, x): - return F.logsigmoid(x, self._name) + return F.log_sigmoid(x, self._name) class Softmax(layers.Layer): diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index d8e1d03b02840e76ff865986d8b90ca9d6cdd9f8..433443fee1765a3ecd4cf0bbe53a960bbeaefc71 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -26,7 +26,7 @@ __all__ = [ 'Pool2D', 'Embedding', 'Linear', - 'UpSample', + 'Upsample', 'Pad2D', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', @@ -131,12 +131,15 @@ class Linear(layers.Layer): return out -class UpSample(layers.Layer): +class Upsample(layers.Layer): """ This op resizes a batch of images. + The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), + Where in_w is width of the input tensor, in_h is the height of the input tensor, + in_d is the depth of the input tensor. and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: @@ -171,6 +174,12 @@ align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them.
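The new `'area'` branch added to `interpolate` in the functional/common.py hunk above returns early and dispatches to adaptive average pooling; the same behavior is documented for this layer in the paragraph that follows. A sketch for a 3-D input, assuming eager mode:

.. code-block:: python

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    x = paddle.to_tensor(np.arange(8, dtype='float32').reshape(1, 1, 8))
    # mode='area' on a 3-D tensor is forwarded to adaptive_avg_pool1d.
    y = F.interpolate(x, size=4, mode='area')
    print(y.numpy())  # [[[0.5, 2.5, 4.5, 6.5]]]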
+ Area interpolation is to perform area interpolation + in both the 3rd dimension(in height direction) , the 4th dimension(in width + direction) and the 5th dimension(in depth direction) on input tensor. Set to + area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or + `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. + Example: .. code-block:: text @@ -273,9 +282,9 @@ class UpSample(layers.Layer): when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.Has to match input size if it is a list. + scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`. Has to match input size if it is either a list or a tuple or a Tensor. Default: None. mode (str): The resample method. It supports 'linear', 'nearst', 'bilinear', 'bicubic' and 'trilinear' currently. Default: 'nearest' @@ -322,7 +331,7 @@ class UpSample(layers.Layer): paddle.disable_static() input_data = np.random.rand(2,3,6,10).astype("float32") - upsample_out = paddle.nn.UpSample(size=[12,12]) + upsample_out = paddle.nn.Upsample(size=[12,12]) input = paddle.to_tensor(input_data) output = upsample_out(x=input) @@ -339,7 +348,7 @@ class UpSample(layers.Layer): align_mode=0, data_format='NCHW', name=None): - super(UpSample, self).__init__() + super(Upsample, self).__init__() self.size = size self.scale_factor = scale_factor self.mode = mode.lower() @@ -366,7 +375,8 @@ class UpsamplingNearest2d(layers.Layer): """ This op upsamples a batch of images, using nearest neighbours' pixel values. The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), - and the upsampling only applies on the two dimensions(height and width). + where in_w is width of the input tensor, in_h is the height of the input tensor. + And the upsampling only applies on the two dimensions(height and width). Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width @@ -381,10 +391,11 @@ class UpsamplingNearest2d(layers.Layer): layer, the shape is (out_h, out_w) when input is a 4-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. - Default: None. Has to match input size if it is a list. + scale_factor (float|int|list|tuple|Tensor|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`. + Has to match input size if it is either a list or a tuple or a Tensor. + Default: None. 
data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -449,7 +460,8 @@ class UpsamplingBilinear2d(layers.Layer): """ This op upsamples a batch of images, using bilinear' pixel values. The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), - and the upsampling only applies on the two dimensions(height and width). + where in_w is width of the input tensor, in_h is the height of the input tensor. + And the upsampling only applies on the two dimensions(height and width). Bilinear interpolation is an extension of linear interpolation for interpolating functions of two variables (e.g. H-direction and @@ -466,10 +478,11 @@ class UpsamplingBilinear2d(layers.Layer): layer, the shape is (out_h, out_w) when input is a 4-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. - Default: None. Has to match input size if it is a list. + scale_factor (float|int|list|tuple|Tensor|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`. + Has to match input size if it is either a list or a tuple or a Tensor. + Default: None. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -1551,22 +1564,18 @@ class CosineSimilarity(layers.Layer): class Embedding(layers.Layer): """ - :alias_main: paddle.nn.Embedding - :alias: paddle.nn.Embedding,paddle.nn.layer.Embedding,paddle.nn.layer.common.Embedding - :old_api: paddle.fluid.dygraph.Embedding - **Embedding Layer** This interface is used to construct a callable object of the ``Embedding`` class. For specific usage, refer to code examples. It implements the function of the Embedding Layer. - This layer is used to lookup embeddings vector of ids provided by :attr:`input` . + This layer is used to lookup embeddings vector of ids provided by :attr:`x` . It automatically constructs a 2D embedding matrix based on the - input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` . + input :attr:`num_embeddings` and attr:`embedding_dim`. The shape of output Tensor is generated by appending an emb_size dimension to the last dimension of the input Tensor shape. - **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , + **Note:** The id in :attr:`x` must satisfy :math:`0 =< id < num_embeddings` , otherwise the program will throw an exception and exit. .. code-block:: text @@ -1594,7 +1603,7 @@ class Embedding(layers.Layer): num_embeddings (int): Just one element which indicate the size of the dictionary of embeddings. embedding_dim: Just one element which indicate the size of each embedding vector respectively. 
- padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). + padding_idx(int|long|None): padding_idx needs to be in the interval [-num_embeddings, num_embeddings). If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. @@ -1605,13 +1614,13 @@ class Embedding(layers.Layer): such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . - In these case, is_sparse must be False. Default: False. + In these cases, sparse must be False. Default: False. weight_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the - default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, + default weight parameter property is used. See usage for details in :ref:`api_ParamAttr` . In addition, user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs to be transformed into numpy format, and the shape of local word - vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` - is used to load custom or pre-trained word vectors. See code example 2 for details. + vector should be consistent with :attr:`num_embeddings` . Then :ref:`api_initializer_NumpyArrayInitializer` + is used to load custom or pre-trained word vectors. See code example for details. name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -1626,20 +1635,34 @@ class Embedding(layers.Layer): ..
code-block:: python - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() + import paddle + import numpy as np + + x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) + y_data = np.arange(6, 12).reshape((3, 2)).astype(np.float32) + paddle.disable_static(paddle.CPUPlace()) + x = paddle.to_tensor(x_data, stop_gradient=False) + y = paddle.to_tensor(y_data, stop_gradient=False) - # example 1 - inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64') - inp_word.shape # [2, 3] - dict_size = 20 + embedding = paddle.nn.Embedding(10, 3, sparse=True) + + w0=np.full(shape=(10, 3), fill_value=2).astype(np.float32) + embedding.weight.set_value(w0) + + adam = paddle.optimizer.Adam(parameters=[embedding.weight], learning_rate=0.01) + adam.clear_grad() + + # weight.shape = [10, 3] + + # x.data = [[3],[4],[5]] + # x.shape = [3, 1] + + # out.data = [[2,2,2], [2,2,2], [2,2,2]] + # out.shape = [3, 1, 3] + out=embedding(x) + out.backward() + adam.step() - emb = nn.Embedding( - dict_size, - 32, - sparse=False) """ def __init__(self, @@ -1656,13 +1679,24 @@ class Embedding(layers.Layer): self._is_distributed = False self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( num_embeddings + padding_idx) + + if self._num_embeddings <= 0: + raise ValueError("num_embeddings must be greater than 0") + + if self._embedding_dim <= 0: + raise ValueError("embedding_dim must be greater than 0") + + if self._padding_idx >= num_embeddings or self._padding_idx < -num_embeddings: + raise ValueError("padding_idx must be within [-{}, {})".format( + num_embeddings, num_embeddings)) + self._dtype = self._helper.get_default_dtype() self._size = [self._num_embeddings, self._embedding_dim] self._weight_attr = weight_attr self._remote_prefetch = False self._name = name - self._weight = self.create_parameter( + self.weight = self.create_parameter( attr=self._weight_attr, shape=self._size, dtype=self._dtype, @@ -1671,7 +1705,7 @@ class Embedding(layers.Layer): def forward(self, x): return F.embedding( x, - weight=self._weight, + weight=self.weight, padding_idx=self._padding_idx, sparse=self._sparse, name=self._name) diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index a60e615d5064bf4ef2229dd67193774030383888..271dc9b4e685ce06cdb12ccdcb6bb0704a5ef2a1 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -627,10 +627,13 @@ class KLDivLoss(fluid.dygraph.Layer): $$l(x, y) = y * (\log(y) - x)$$ Parameters: - reduction (str, optional): Indicate how to average the loss, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. - If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; - Default is ``'mean'``. + reduction (str, optional): Indicate how to average the loss, + the candidates are ``'none'`` | ``'batchmean'`` | ``'mean'`` | ``'sum'``. + If `reduction` is ``'mean'``, the reduced mean loss is returned; + If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned; + if `reduction` is ``'sum'``, the reduced sum loss is returned; + if `reduction` is ``'none'``, no reduction will be applied. + Default is ``'mean'``.
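A NumPy cross-check of the corrected `'batchmean'` semantics documented above: the docstring formula is summed over all elements and divided by the batch size, yielding a single element. A minimal sketch:

.. code-block:: python

    import numpy as np

    def kl_div_batchmean(x, target):
        # Matches the docstring formula l(x, y) = y * (log(y) - x),
        # summed, then divided by the batch size.
        loss = target * (np.log(target) - x)
        return loss.sum() / x.shape[0]

    x = np.random.uniform(0.1, 1.0, (5, 20)).astype('float32')
    target = np.random.uniform(0.1, 1.0, (5, 20)).astype('float32')
    print(kl_div_batchmean(x, target))  # one scalar, i.e. shape [1] in paddle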
Shape: @@ -654,11 +657,11 @@ class KLDivLoss(fluid.dygraph.Layer): x = np.random.uniform(-10, 10, shape).astype('float32') target = np.random.uniform(-10, 10, shape).astype('float32') - # 'batchmean' reduction, loss shape will be [N] + # 'batchmean' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='batchmean') pred_loss = kldiv_criterion(paddle.to_tensor(x), paddle.to_tensor(target)) - # shape=[5] + # shape=[1] # 'mean' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='mean') @@ -684,7 +687,7 @@ class KLDivLoss(fluid.dygraph.Layer): self.reduction = reduction def forward(self, input, label): - out = paddle.nn.functional.kl_div(input, label, self.reduction) + out = F.kl_div(input, label, self.reduction) return out diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index d13bf66ba5bfe483284e78dbcd2a42f8f3397210..2000fbf388f88d1da7119402104706a433cebf06 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -1130,10 +1130,10 @@ class SyncBatchNorm(_BatchNormBase): """ layer_output = layer if isinstance(layer, _BatchNormBase): - layer_output = SyncBatchNorm(layer._num_features, layer._epsilon, - layer._momentum, layer._weight_attr, - layer._bias_attr, layer._data_format, - layer._name) + layer_output = SyncBatchNorm( + layer._num_features, layer._momentum, layer._epsilon, + layer._weight_attr, layer._bias_attr, layer._data_format, + layer._track_running_stats, layer._name) if layer._weight_attr != False and layer._bias_attr != False: with no_grad(): diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index ad53bf394660f3a7e0e48fdbd5eb530abd0852bb..7a21e7661d4e78d0004996ee67c80ddc35006bc3 100644 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -112,6 +112,14 @@ class WeightNorm(object): if dim is None: dim = -1 + # support dim as a negative number, (dim = -1) == (dim = None) + weight_dim = len(layer._parameters[name].shape) + assert ( + dim < weight_dim and dim >= -1 * weight_dim + ), "dim must be set within [-R, R), where R means the dimension of weight." + if dim != -1: + dim = (dim + weight_dim) % weight_dim + fn = WeightNorm(name, dim) w = getattr(layer, name) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 3150b8c2d0363274dfb6fd3465110c89339cd4c9..708aaa788f60d56a2adb41c8a571079354b3c192 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -250,3 +250,47 @@ class Adam(Optimizer): stop_gradient=True) return adam_op + + @framework.dygraph_only + def step(self): + """ + Execute the optimizer and update parameters once. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adam(learning_rate=0.01, + parameters=linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_grad() + """ + parameter_list = self._parameter_list + self._dtype = None + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if hasattr( + param, "_is_sparse" + ) and param._is_sparse and self.regularization is not None: + raise RuntimeError( + "Adam doesn't support weight_decay with sparse parameters, please set it to None."
+ ) + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + params_grads.append((param, grad_var)) + + optimize_ops = self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) diff --git a/python/paddle/regularizer.py b/python/paddle/regularizer.py index 2b20bb41970f0b1bd829585cd3767c6c08421f1e..b3f483fd89197c9bd0a447b4272e958824331942 100644 --- a/python/paddle/regularizer.py +++ b/python/paddle/regularizer.py @@ -12,8 +12,134 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define the regularizer functions -# __all__ = ['L1Decay', -# 'L1DecayRegularizer', -# 'L2Decay', -# 'L2DecayRegularizer'] +__all__ = ['L1Decay', 'L2Decay'] + +import paddle.fluid as fluid + + +class L1Decay(fluid.regularizer.L1Decay): + """ + Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse. + + It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). + When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in + ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has + higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined + in its ParamAttr, then the regularizer in Optimizer will be ignored. Otherwise the regularizer + in Optimizer will be used. + + In the implementation, the formula of L1 Weight Decay Regularization is as follows: + + .. math:: + + L1WeightDecay = reg\_coeff * sign(parameter) + + Args: + coeff(float, optional): regularization coeff. Default: 0.0. + + Examples: + .. code-block:: python + + # Example 1: set Regularizer in optimizer + import paddle + from paddle.regularizer import L1Decay + import numpy as np + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + momentum = paddle.optimizer.Momentum( + learning_rate=0.1, + parameters=linear.parameters(), + weight_decay=L1Decay(0.0001)) + out.backward() + momentum.step() + momentum.clear_grad() + + # Example 2: set Regularizer in parameters + # Set L1 regularization in parameters. + # Global regularizer does not take effect on my_conv2d for this case. + from paddle.nn import Conv2d + from paddle import ParamAttr + from paddle.regularizer import L1Decay + + my_conv2d = Conv2d( + in_channels=10, + out_channels=10, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(regularizer=L1Decay(coeff=0.01)), + bias_attr=False) + """ + + def __init__(self, coeff=0.0): + super(L1Decay, self).__init__(coeff) + + +class L2Decay(fluid.regularizer.L2Decay): + """ + Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting. + + It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). + When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in + ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has + higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined + in its ParamAttr, then the regularizer in Optimizer will be ignored.
Otherwise the regularizer + in Optimizer will be used. + + In the implementation, the formula of L2 Weight Decay Regularization is as follows: + + .. math:: + + L2WeightDecay = reg\_coeff * parameter + + Args: + coeff(float, optional): regularization coeff. Default: 0.0. + + Examples: + .. code-block:: python + + # Example 1: set Regularizer in optimizer + import paddle + from paddle.regularizer import L2Decay + import numpy as np + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + momentum = paddle.optimizer.Momentum( + learning_rate=0.1, + parameters=linear.parameters(), + weight_decay=L2Decay(0.0001)) + out.backward() + momentum.step() + momentum.clear_grad() + + # Example 2: set Regularizer in parameters + # Set L2 regularization in parameters. + # Global regularizer does not take effect on my_conv2d for this case. + from paddle.nn import Conv2d + from paddle import ParamAttr + from paddle.regularizer import L2Decay + + my_conv2d = Conv2d( + in_channels=10, + out_channels=10, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(regularizer=L2Decay(coeff=0.01)), + bias_attr=False) + """ + + def __init__(self, coeff=0.0): + super(L2Decay, self).__init__(coeff) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 0fed32a1676759bd94961af0a8949d035ec48c8f..8bb584be2362e7b02bc5b7c5603b148d37499c2d 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -40,6 +40,7 @@ from .creation import full_like #DEFINE_ALIAS from .creation import triu #DEFINE_ALIAS from .creation import tril #DEFINE_ALIAS from .creation import meshgrid #DEFINE_ALIAS +from .creation import empty #DEFINE_ALIAS from .io import save #DEFINE_ALIAS from .io import load #DEFINE_ALIAS from .linalg import matmul #DEFINE_ALIAS diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 9eece1240d7d3c0b8a863091367e993047bd4527..8011b92964b7e21fd930f19cec954b27f470e0c6 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -48,6 +48,7 @@ __all__ = [ 'eye', 'full', 'full_like', + 'empty', 'triu', 'tril', 'meshgrid' @@ -62,8 +63,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): If the ``data`` is already a tensor, and ``dtype`` or ``place`` does't change, no copy will be performed and return origin tensor, otherwise a new tensor will be constructed - and returned. Similarly, if the data is an numpy\.ndarray of with the same ``dtype`` - and the current place is cpu, no copy will be performed. + and returned. The ``ComplexTensor`` is a unique type of paddle. If x is ``ComplexTensor``, then ``x.real`` is the real part, and ``x.imag`` is the imaginary part.
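Note on the zero_copy hunk that follows: a minimal sketch (not part of the patch) of the copy semantics it establishes. With zero_copy=False the constructed tensor owns its own buffer, so mutating the source numpy array afterwards does not change the tensor. Only paddle 2.0 dygraph APIs already used in the examples above appear here.

    import numpy as np
    import paddle

    paddle.disable_static()
    data = np.ones([2, 2], dtype=np.float32)
    t = paddle.to_tensor(data)    # the data is copied, since zero_copy=False
    data[0, 0] = 5.0              # mutate the source array after construction
    print(t.numpy()[0, 0])        # still 1.0: the tensor kept its own copy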
@@ -208,20 +208,20 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): value=data, place=place, persistable=False, - zero_copy=True, + zero_copy=False, stop_gradient=stop_gradient) else: name = unique_name.generate('generated_tensor') real_tensor = paddle.Tensor( value=data.real, place=place, - zero_copy=True, + zero_copy=False, name=name + ".real", stop_gradient=stop_gradient) imag_tensor = paddle.Tensor( value=data.imag, place=place, - zero_copy=True, + zero_copy=False, name=name + ".imag", stop_gradient=stop_gradient) return paddle.ComplexTensor(real_tensor, imag_tensor) @@ -981,3 +981,90 @@ def diag(x, offset=0, padding_value=0, name=None): out.stop_gradient = True return out + + +def empty(shape, dtype=None, name=None): + """ + This Op returns a Tensor with uninitialized data whose size is the same as ``shape``. + + Args: + shape(list|tuple|Tensor): Shape of the Tensor to be created. + The data type of the elements of ``shape`` is ``int32`` or ``int64`` . If ``shape`` is a list or tuple, + the elements of it should be integers or Tensors with shape [1]. + If ``shape`` is a Tensor, it should be a 1-D Tensor. + dtype(np.dtype|str, optional): Data type of the output Tensor + which can be bool, float16, float32, float64, int32, int64. If ``dtype`` is `None`, the data + type of the created Tensor uses the global default dtype (see ``get_default_dtype`` + for details). + name(str, optional): The default value is None. Normally there is no need for the user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: Tensor which is created according to ``shape`` and ``dtype``, and is uninitialized. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() # Now we are in imperative mode + paddle.set_device("cpu") # and use cpu device + + # example 1: argument ``shape`` is a list which doesn't contain Tensor. + data1 = paddle.empty(shape=[2,3], dtype='float32') + #[[4.3612203e+27 1.8176809e+31 1.3555911e-19] # uninitialized + # [1.1699684e-19 1.3563156e-19 3.6408321e-11]] # uninitialized + + # example 2: argument ``shape`` is a Tensor, the data type must be int64 or int32. + shape_data = np.array([2, 3]).astype('int32') + shape = paddle.to_tensor(shape_data) + data2 = paddle.empty(shape=shape, dtype='float32') + #[[1.7192326e-37 4.8125365e-38 1.9866003e-36] # uninitialized + # [1.3284029e-40 7.1117408e-37 2.5353012e+30]] # uninitialized + + # example 3: argument ``shape`` is a list which contains Tensor.
+ dim2_data = np.array([3]).astype('int32') + dim2 = paddle.to_tensor(dim2_data) + data3 = paddle.empty(shape=[2, dim2], dtype='float32') + #[[1.1024214e+24 7.0379409e+22 6.5737699e-34] # uninitialized + # [7.5563101e+31 7.7130405e+31 2.8020654e+20]] # uninitialized + """ + + if dtype is None: + dtype = paddle.get_default_dtype() + + dtype = convert_dtype(dtype) + + if in_dygraph_mode(): + shape = utils.convert_shape_to_list(shape) + out = core.ops.empty('shape', shape, 'dtype', + convert_np_dtype_to_dtype_(dtype)) + out.stop_gradient = True + return out + + helper = LayerHelper("empty", **locals()) + inputs = {} + + check_dtype(dtype, 'dtype', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'empty') + check_type(shape, 'shape', (Variable, list, tuple), 'empty') + + if isinstance(shape, Variable): + check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'empty') + + attrs = {} + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='empty') + + out = helper.create_variable_for_type_inference(dtype=dtype) + attrs['dtype'] = convert_np_dtype_to_dtype_(dtype) + helper.append_op( + type='empty', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs, + stop_gradient=True) + out.stop_gradient = True + return out diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 7ddda5091a0a260f56b29bcedfdcb0786e82ddd6..67e3ce21ffba0c312eb01163cdf32f87c6433ee1 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -183,12 +183,13 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): x (Tensor): The input tensor could be N-D tensor, and the input data type could be float32 or float64. p (float|string, optional): Order of the norm. Supported values are `fro`, `0`, `1`, `2`, - `inf`,`-inf` and any positive real number yielding the corresponding p-norm. - Not supported: ord < 0, nuclear norm. + `inf`, `-inf` and any positive real number yielding the corresponding p-norm. Not supported: ord < 0 and nuclear norm. + Default value is `fro`. axis (int|list|tuple, optional): The axis on which to apply norm operation. If axis is int or list(int)/tuple(int) with only one element, the vector norm is computed over the axis. If `axis < 0`, the dimension to norm operation is rank(input) + axis. If axis is a list(int)/tuple(int) with two elements, the matrix norm is computed over the axis. + Default value is `None`. keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have fewer dimension than the :attr:`input` unless :attr:`keepdim` is true, default @@ -197,13 +198,9 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: Tensor, results of norm operation on the specified axis of input tensor, + Tensor: results of norm operation on the specified axis of input tensor, it's data type is the same as input's Tensor. - Raises: - TypeError, if out data type is different with the input data type. - ValueError, If `p` or `axis` is invalid. - Examples: .. code-block:: python @@ -256,15 +253,13 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): "The dim of frobenius norm op should be None or two elements list!"
) if in_dygraph_mode(): - if dim is None: dim = [-1] - return core.ops.frobenius_norm(input, 'dim', dim, 'keepdim', - keepdim) - attrs = { - 'dim': dim if dim != None else [-2, -1], - 'keep_dim': keepdim, - 'reduce_all': False - } - if len(attrs['dim']) == len(input.shape): + if dim is None: + return core.ops.frobenius_norm(input, 'keep_dim', keepdim, + 'reduce_all', True) + return core.ops.frobenius_norm(input, 'dim', dim, 'keep_dim', + keepdim, 'reduce_all', False) + attrs = {'dim': dim, 'keep_dim': keepdim, 'reduce_all': False} + if dim is None: attrs['reduce_all'] = True check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'frobenius_norm') @@ -351,42 +346,6 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): return reduce_out - def p0_matrix_norm(input, porder=0., axis=axis, keepdim=False, name=None): - block = LayerHelper('norm', **locals()) - out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) - - cast_out = block.create_variable_for_type_inference(dtype=bool) - block.append_op( - type='cast', - inputs={'X': input}, - outputs={'Out': cast_out}, - attrs={ - 'in_dtype': input.dtype, - 'out_dtype': int(core.VarDesc.VarType.BOOL) - }) - cast_out2 = block.create_variable_for_type_inference(dtype=bool) - block.append_op( - type='cast', - inputs={'X': cast_out}, - outputs={'Out': cast_out2}, - attrs={ - 'in_dtype': cast_out.dtype, - 'out_dtype': int(core.VarDesc.VarType.FP32) - }) - sum_out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) - block.append_op( - type='reduce_sum', - inputs={'X': cast_out2}, - outputs={'Out': sum_out}, - attrs={ - 'dim': axis, - 'keep_dim': keepdim, - 'reduce_all': True if axis is None else False - }) - return sum_out - def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None): block = LayerHelper('norm', **locals()) out = block.create_variable_for_type_inference( @@ -448,7 +407,20 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): #calculate vector norm, where axis is int or list with only one integer if isinstance(axis, int): - if isinstance(p, (int, float)): + if isinstance(p, str): + if p == "fro": + return vector_norm( + x, + porder=2, + axis=axis, + keepdim=keepdim, + asvector=False, + name=name) + + else: + raise ValueError( + "the only valid string value is 'fro', found {}".format(p)) + elif isinstance(p, (int, float)): return vector_norm( x, axis=axis, @@ -464,10 +436,12 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): elif isinstance(axis, list) and len(axis) == 2: if p == "fro": return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name) - elif p == 0: - return p0_matrix_norm(x, axis=axis, keepdim=keepdim, name=name) elif p == np.inf or p == -np.inf: return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name) + elif p == 0: + raise ValueError( + "only support axis type int or list (length of list <= 1) if p = 0, found {}".
+ format(axis)) else: return p_matrix_norm( x, porder=p, axis=axis, keepdim=keepdim, name=name) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 363c3ffceb85ef6168dc8c33b81185cac08083fb..9de407841fb461713d00f997afdf33a38a531245 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -21,6 +21,7 @@ from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_t from ..fluid.layers.tensor import fill_constant from ..fluid.layers import utils import numpy as np +import six # TODO: define functions to manipulate a tensor from ..fluid.layers import cast #DEFINE_ALIAS from ..fluid.layers import slice #DEFINE_ALIAS @@ -746,8 +747,6 @@ def unsqueeze(x, axis, name=None): print(out3.shape) # [1, 1, 1, 5, 10] """ - if isinstance(axis, int): - axis = [axis] return layers.unsqueeze(x, axis, name) @@ -1001,7 +1000,7 @@ def chunk(x, chunks, axis=0, name=None): x_np = np.random.random([3, 9, 5]).astype("int32") x = paddle.to_tensor(x_np) - out0, out1, out22 = paddle.chunk(x, chunks=3, axis=1) + out0, out1, out2 = paddle.chunk(x, chunks=3, axis=1) # out0.shape [3, 3, 5] # out1.shape [3, 3, 5] # out2.shape [3, 3, 5] @@ -1058,10 +1057,25 @@ def tile(x, repeat_times, name=None): """ if in_dygraph_mode(): return core.ops.tile(x, 'repeat_times', repeat_times) + check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile') + if isinstance(repeat_times, Variable): + assert len(repeat_times.shape) == 1, ( + 'repeat_times must be a 1-D Tensor.') + else: + for elem in repeat_times: + if isinstance(elem, Variable): + assert len(elem.shape) == 1, ( + 'Elements in repeat_times must be 1-D Tensors or integers.') + else: + if six.PY3: + type_tuple = (int, np.int32, np.int64) + elif six.PY2: + type_tuple = (int, long, np.int32, np.int64) + assert isinstance(elem, type_tuple), ( + 'Elements in repeat_times must be 1-D Tensors or integers.') check_variable_and_dtype( x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'tile') - check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile') if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: raise ValueError( "When the date type is bool for the input 'x' of tile op, you " @@ -1183,18 +1197,33 @@ def expand(x, shape, name=None): if in_dygraph_mode(): return core.ops.expand_v2(x, 'shape', shape) + if isinstance(shape, Variable): + assert len(shape.shape) == 1, ('shape must be a 1-D Tensor.') + else: + for elem in shape: + if isinstance(elem, Variable): + assert len(elem.shape) == 1, ( + 'Elements in shape must be 1-D Tensors or integers.') + else: + if six.PY3: + type_tuple = (int, np.int32, np.int64) + elif six.PY2: + type_tuple = (int, long, np.int32, np.int64) + assert isinstance(elem, type_tuple), ( + 'Elements in shape must be 1-D Tensors or integers.') + check_variable_and_dtype( x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand') check_type(shape, 'shape', (list, tuple, Variable), 'expand') - - inputs = {"X": [x]} - attrs = {} if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: raise ValueError("When the data type of input 'x' for expand is bool, " "you must set its stop_gradient to be False by " "some_var.stop_gradient = True, supporting " "some_var as the input.") + inputs = {"X": [x]} + attrs = {} + helper = LayerHelper('expand', **locals()) def get_attr_expand_shape(list_expand_shape): diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index
ed2bbe03a366054dfe7d798310c7fa5d419b44a8..966544c7abb54ae7de163aa322890a55ee94d3d8 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -64,7 +64,6 @@ from ..fluid.layers import increment #DEFINE_ALIAS from ..fluid.layers import multiplex #DEFINE_ALIAS from ..fluid.layers import sums #DEFINE_ALIAS from ..fluid import layers -import paddle __all__ = [ @@ -343,69 +342,9 @@ def divide(x, y, name=None): axis = -1 act = None if in_dygraph_mode(): - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.") - - # rule 2: both the inputs are not Tensor - elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): - x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x) - y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y) - - # rule 3: both the inputs are Tensor - elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): - if y.dtype != x.dtype: - raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype." - "But x is {}, y is {}".format(x.dtype, y.dtype)) - elif x.dtype in _supported_int_dtype_: - x = x.astype(paddle.get_default_dtype()) - y = y.astype(paddle.get_default_dtype()) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): - if x.dtype in _supported_int_dtype_: - x = x.astype(paddle.get_default_dtype()) - y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): - if y.dtype in _supported_int_dtype_: - y = y.astype(paddle.get_default_dtype()) - x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x) - return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.") - - # rule 2: both the inputs are not Tensor - elif not isinstance(x, Variable) and not isinstance(y, Variable): - x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x) - y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y) - - # rule 3: both the inputs are Tensor - elif isinstance(x, Variable) and isinstance(y, Variable): - if y.dtype != x.dtype: - raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype." 
- "But x is {}, y is {}".format(x.dtype, y.dtype)) - elif x.dtype in _supported_int_dtype_: - x = paddle.cast(x, paddle.get_default_dtype()) - y = paddle.cast(y, paddle.get_default_dtype()) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, Variable) and not isinstance(y, Variable): - if x.dtype in _supported_int_dtype_: - x = paddle.cast(x, paddle.get_default_dtype()) - y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, Variable) and isinstance(y, Variable): - if y.dtype in _supported_int_dtype_: - y = paddle.cast(y, paddle.get_default_dtype()) - x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x) - return _elementwise_op(LayerHelper(op_type, **locals())) @@ -444,55 +383,9 @@ def floor_divide(x, y, name=None): op_type = 'elementwise_floordiv' axis = -1 if in_dygraph_mode(): - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("floor_divide(): arguments must be Tensor or scalar, not numpy.ndarray.") - - # rule 2: both the inputs are not Tensor - elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): - x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x) - y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y) - - # rule 3: both the inputs are Tensor - elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): - if y.dtype != x.dtype: - raise TypeError("floor_divide(): argument position 1 and argument position 2 must have the same dtype." - "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): - y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): - x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x) - return _elementwise_op_in_dygraph( x, y, axis=axis, op_name=op_type) - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.") - - # rule 2: both the inputs are not Tensor - elif not isinstance(x, Variable) and not isinstance(y, Variable): - x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x) - y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y) - - # rule 3: both the inputs are Tensor - elif isinstance(x, Variable) and isinstance(y, Variable): - if y.dtype != x.dtype: - raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype." 
- "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, Variable) and not isinstance(y, Variable): - y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, Variable) and isinstance(y, Variable): - x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x) - return _elementwise_op(LayerHelper(op_type, **locals())) @@ -531,43 +424,9 @@ def remainder(x, y, name=None): op_type = 'elementwise_mod' axis = -1 if in_dygraph_mode(): - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.") - - elif not isinstance(x, paddle.Tensor): - raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x))) - - # rule 3: both the inputs are Tensor - elif isinstance(y, paddle.Tensor): - if y.dtype != x.dtype: - raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype." - "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif not isinstance(y, paddle.Tensor): - y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) - return _elementwise_op_in_dygraph( x, y, axis=axis, op_name=op_type) - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.") - - elif not isinstance(x, Variable): - raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x))) - - # rule 3: both the inputs are Tensor - elif isinstance(y, Variable): - if y.dtype != x.dtype: - raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype." - "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif not isinstance(y, paddle.Tensor): - y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) - return _elementwise_op(LayerHelper(op_type, **locals())) @@ -1194,15 +1053,14 @@ def logsumexp(x, axis=None, keepdim=False, name=None): axis = [0] if in_dygraph_mode(): - return core.ops.logsumexp(x, 'dim', axis, 'keep_dim', keepdim, - 'reduce_all', reduce_all) + return core.ops.logsumexp(x, 'axis', axis, 'keepdim', keepdim, 'reduce_all', reduce_all) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'logsumexp') helper = LayerHelper('logsumexp', **locals()) - attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all} + attrs = {'axis': axis, 'keepdim': keepdim, 'reduce_all':reduce_all} out = helper.create_variable_for_type_inference(x.dtype) helper.append_op( type='logsumexp', inputs={'X': x}, outputs={'Out': out}, attrs=attrs) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index ce03d0ef15f0f80f4e01cf57bc8cc449186c2560..f55d285586f0ec6959573af64e720bea5de10c8d 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -167,10 +167,10 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): "The type of 'axis' must be int or None in argmax, but received %s." % (type(axis))) - if not (isinstance(dtype, str) or isinstance(dtype, np.dtype)): - raise TypeError( - "the type of 'dtype' in argmax must be str or np.dtype, but received {}". 
- format(type(dtype))) + if dtype is None: + raise ValueError( + "the value of 'dtype' in argmax must not be None, but received None" + ) var_dtype = convert_np_dtype_to_dtype_(dtype) check_dtype(var_dtype, 'dtype', ['int32', 'int64'], 'argmin') @@ -245,10 +245,10 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): "The type of 'axis' must be int or None in argmin, but received %s." % (type(axis))) - if not (isinstance(dtype, str) or isinstance(dtype, np.dtype)): - raise TypeError( - "the type of 'dtype' in argmin must be str or np.dtype, but received {}". - format(dtype(dtype))) + if dtype is None: + raise ValueError( + "the value of 'dtype' in argmin must not be None, but received None" + ) var_dtype = convert_np_dtype_to_dtype_(dtype) check_dtype(var_dtype, 'dtype', ['int32', 'int64'], 'argmin') diff --git a/python/paddle/tests/test_dist_hapi_model.py b/python/paddle/tests/test_dist_hapi_model.py index e75e08e3749e6ce629e88c486e4f87d9109dc709..db5b63c5ae0e29fa6f1274befd277c4e46c3a1b1 100644 --- a/python/paddle/tests/test_dist_hapi_model.py +++ b/python/paddle/tests/test_dist_hapi_model.py @@ -37,7 +37,11 @@ def get_cluster_from_args(selected_gpus): free_ports = find_free_ports(len(selected_gpus)) if free_ports is not None: free_ports = list(free_ports) - return get_cluster(node_ips, node_ip, free_ports, selected_gpus) + + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) + return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus) def get_gpus(selected_gpus): diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index b7b5d44650f8d62926241a57feedfd5b932a37f5..5c4e98feaa686217bc78ad3915423593ad4fcdce 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -523,6 +523,24 @@ class TestModelFunction(unittest.TestCase): model.summary(input_size=[(20)]) model.summary(input_size=(20), batch_size=2) + def test_summary_nlp(self): + paddle.enable_static() + nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) + paddle.summary(nlp_net, (1, 2)) + + def test_summary_error(self): + with self.assertRaises(TypeError): + nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) + paddle.summary(nlp_net, (1, '2')) + + with self.assertRaises(ValueError): + nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) + paddle.summary(nlp_net, (-1, -1)) + + paddle.disable_static() + nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) + paddle.summary(nlp_net, (1, 2)) + def test_export_deploy_model(self): for dynamic in [True, False]: fluid.enable_dygraph() if dynamic else None diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 2a649c776b4103b1d3d8648957bbff7a32007410..4a786679727fb1b42c216146685e0e6524e858c9 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -16,12 +16,13 @@ from .profiler import ProfilerOptions from .profiler import Profiler from .profiler import get_profiler from .deprecated import deprecated +from ..fluid.framework import unique_name +from ..fluid.framework import load_op_library +from ..fluid.framework import require_version from .
import download __all__ = ['dump_config', 'deprecated', 'download'] #TODO: define new api under this directory -# __all__ = ['unique_name', -# 'load_op_library', -# 'require_version'] +__all__ += ['unique_name', 'load_op_library', 'require_version'] diff --git a/python/setup.py.in b/python/setup.py.in index 64ac2b9b9a4d210c59193e117c6000986bfb07a0..773166400347ab550f82e4fabcb0d89b90818fc2 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -156,6 +156,7 @@ packages=['paddle', 'paddle.framework', 'paddle.jit', 'paddle.fluid', + 'paddle.fluid.inference', 'paddle.fluid.dygraph', 'paddle.fluid.dygraph.dygraph_to_static', 'paddle.fluid.dygraph.amp', diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b6a7546616678c21d4241da42f0965d4b85f2e6 --- /dev/null +++ b/python/unittest_py/requirements.txt @@ -0,0 +1,4 @@ +PyGithub +coverage +pycrypto +mock diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 1e5179d0282d7f35c4232d9b9783cb831e83f462..84254cc89bb8eef12a95189416cd29cce828f5ca 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -287,12 +287,19 @@ fi pip install PyGithub # For getting PR related data wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate +wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true` if [ "${HASUTFIXED}" != "" ]; then echo_line="${HASUTFIXED} You must have one RD (chalsliu (Recommend) or kolinwei) approval.\n" check_approval 1 45041955 22165420 fi +HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has benchmark issue to be fixed" || true` +if [ "${HASUTFIXED}" != "" ]; then + echo_line="${HASUTFIXED} You must have one RD (hysunflower or xiegegege or Xreki) approval.\n" + check_approval 1 52739577 46314656 12538138 +fi + if [ -n "${echo_list}" ];then echo "****************" echo -e "${echo_list[@]}" diff --git a/tools/check_ut.py b/tools/check_ut.py index 7b5e5a4f1c55574edc3f28dac76ebf1d932748d7..f5fe4c687dd7828f001ddbab744d66931e37f532 100644 --- a/tools/check_ut.py +++ b/tools/check_ut.py @@ -27,9 +27,12 @@ class PRChecker(object): self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60) self.repo = None - def check(self): - """ check pr. """ - filename = 'block.txt' + def check(self, filename, msg): + """ + Args: + filename (str): File to get block names. + msg (str): Error message. 
+ """ pr_id = os.getenv('GIT_PR_ID') if not pr_id: print('No PR ID') @@ -44,12 +47,10 @@ class PRChecker(object): with open(filename) as f: for l in f: if l.rstrip('\r\n') == user: - print('{} has unit-test to be fixed, so CI failed.'.format( - user)) - exit(1) - exit(0) + print('{} {}'.format(user, msg)) if __name__ == '__main__': pr_checker = PRChecker() - pr_checker.check() + pr_checker.check('block.txt', 'has unit-test to be fixed, so CI failed.') + pr_checker.check('bk.txt', 'has benchmark issue to be fixed, so CI failed.') diff --git a/tools/coverage/coverage_diff.py b/tools/coverage/coverage_diff.py index 051348d358f459a67d39dd9ca798721a82aa2233..38f671fe4089d1f94caafaf26640e4df75870f55 100644 --- a/tools/coverage/coverage_diff.py +++ b/tools/coverage/coverage_diff.py @@ -90,12 +90,12 @@ def get_info_file_lines(info_file, diff_file): continue elif line.startswith('LF:'): - print 'LF:{}'.format(current_lf) + print('LF:{}'.format(current_lf)) continue elif line.startswith('LH:'): - print 'LH:{}'.format(current_lh) + print('LH:{}'.format(current_lh)) continue diff --git a/tools/coverage/coverage_diff_list.py b/tools/coverage/coverage_diff_list.py index 57222da4d9818ebbcb0b9aeea6937494038eecdf..8975185edadfbd567a428bbd90523923f5ab675d 100644 --- a/tools/coverage/coverage_diff_list.py +++ b/tools/coverage/coverage_diff_list.py @@ -40,7 +40,7 @@ def filter_by(list_file, max_rate): except: pass - print name, rate + print(name, rate) if __name__ == '__main__': diff --git a/tools/coverage/coverage_lines.py b/tools/coverage/coverage_lines.py index eb846cc9f249a3ff177fde437362a1122f4409a5..cdec5b8b1bb1873f8b9ef761e9d8575c89fee234 100644 --- a/tools/coverage/coverage_lines.py +++ b/tools/coverage/coverage_lines.py @@ -33,7 +33,7 @@ def get_lines(info_file): hits += 1 if total == 0: - print 'no data found' + print('no data found') exit() return hits / total @@ -47,17 +47,17 @@ if __name__ == '__main__': expected = float(sys.argv[2]) if not os.path.isfile(info_file): - print 'info file {} is not exists, ignored'.format(info_file) + print('info file {} is not exists, ignored'.format(info_file)) exit() actual = get_lines(info_file) actual = round(actual, 3) if actual < expected: - print 'expected >= {} %, actual {} %, failed'.format( - round(expected * 100, 1), round(actual * 100, 1)) + print('expected >= {} %, actual {} %, failed'.format( + round(expected * 100, 1), round(actual * 100, 1))) exit(1) - print 'expected >= {} %, actual {} %, passed'.format( - round(expected * 100, 1), round(actual * 100, 1)) + print('expected >= {} %, actual {} %, passed'.format( + round(expected * 100, 1), round(actual * 100, 1))) diff --git a/tools/coverage/paddle_coverage.sh b/tools/coverage/paddle_coverage.sh index d54434b738db5b5e6192f7d3bf9e48edbebc5b7c..008b35d01ca565a6d32265f595dd2d6aa55be707 100644 --- a/tools/coverage/paddle_coverage.sh +++ b/tools/coverage/paddle_coverage.sh @@ -5,7 +5,7 @@ set -xe PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" # install lcov -curl -o /lcov-1.14.tar.gz -x "" -s https://paddle-ci.gz.bcebos.com/coverage/lcov-1.14.tar.gz +curl -o /lcov-1.14.tar.gz -x "" -s https://paddle-ci.gz.bcebos.com/coverage/lcov-1.14.tar.gz || exit 101 tar -xf /lcov-1.14.tar.gz -C / cd /lcov-1.14 make install @@ -14,7 +14,7 @@ make install cd /paddle/build -python ${PADDLE_ROOT}/tools/coverage/gcda_clean.py ${GIT_PR_ID} +python3 ${PADDLE_ROOT}/tools/coverage/gcda_clean.py ${GIT_PR_ID} lcov --capture -d ./ -o coverage.info --rc lcov_branch_coverage=0 @@ -53,9 +53,9 @@ 
gen_full_html_report || true function gen_diff_html_report() { if [ "${GIT_PR_ID}" != "" ]; then - COVERAGE_DIFF_PATTERN="`python ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`" + COVERAGE_DIFF_PATTERN="`python3 ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`" - python ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > git-diff.out + python3 ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > git-diff.out fi lcov --extract coverage-full.info \ @@ -63,7 +63,7 @@ function gen_diff_html_report() { -o coverage-diff.info \ --rc lcov_branch_coverage=0 - python ${PADDLE_ROOT}/tools/coverage/coverage_diff.py coverage-diff.info git-diff.out > coverage-diff.tmp + python3 ${PADDLE_ROOT}/tools/coverage/coverage_diff.py coverage-diff.info git-diff.out > coverage-diff.tmp mv -f coverage-diff.tmp coverage-diff.info @@ -82,7 +82,7 @@ set -x coverage xml -i -o python-coverage.xml -python ${PADDLE_ROOT}/tools/coverage/python_coverage.py > python-coverage.info +python3 ${PADDLE_ROOT}/tools/coverage/python_coverage.py > python-coverage.info # python full html report # @@ -143,5 +143,6 @@ echo "Assert Python Diff Coverage" python ${PADDLE_ROOT}/tools/coverage/coverage_lines.py python-coverage-diff.info 0.9 || PYTHON_COVERAGE_LINES_ASSERT=1 if [ "$COVERAGE_LINES_ASSERT" = "1" ] || [ "$PYTHON_COVERAGE_LINES_ASSERT" = "1" ]; then + echo "exit 9" > /tmp/paddle_coverage.result exit 9 fi diff --git a/tools/coverage/pull_request.py b/tools/coverage/pull_request.py index 979f476d2a1f3fbf28d43e2b717deb7b2d1b0fff..105460032f7db538eaf7a193776bf8085e2837a1 100644 --- a/tools/coverage/pull_request.py +++ b/tools/coverage/pull_request.py @@ -40,7 +40,7 @@ def get_files(args): pull = get_pull(args.pull_id) for file in pull.get_files(): - print '/paddle/{}'.format(file.filename) + print('/paddle/{}'.format(file.filename)) def diff(args): @@ -55,8 +55,8 @@ def diff(args): pull = get_pull(args.pull_id) for file in pull.get_files(): - print '+++ {}'.format(file.filename) - print file.patch + print('+++ {}'.format(file.filename)) + print(file.patch) if __name__ == '__main__': diff --git a/tools/coverage/python_coverage.py b/tools/coverage/python_coverage.py index ba67e12249bb0ccee608ea120321eaa3a2ccbc91..8ad9d85c1bf6b5ed542fb8469173c4f1815050a4 100644 --- a/tools/coverage/python_coverage.py +++ b/tools/coverage/python_coverage.py @@ -12,10 +12,7 @@ root = tree.getroot() sources = root.findall('sources/source') -if len(sources) > 1: - exit(1) - -source = sources[0].text +source = sources[-1].text for clazz in root.findall('packages/package/classes/class'): clazz_filename = clazz.attrib.get('filename') @@ -28,8 +25,8 @@ for clazz in root.findall('packages/package/classes/class'): if not path.exists(clazz_filename): continue - print 'TN:' - print 'SF:{}'.format(clazz_filename) + print('TN:') + print('SF:{}'.format(clazz_filename)) branch_index = 0 @@ -50,16 +47,16 @@ for clazz in root.findall('packages/package/classes/class'): taken = int(taken) for _ in range(taken): - print 'BRDA:{},{},{},{}'.format(line_number, 0, branch_index, - line_hits) + print('BRDA:{},{},{},{}'.format(line_number, 0, branch_index, + line_hits)) branch_index += 1 if line_missing_branches: for missing_branch in line_missing_branches.split(','): - print 'BRDA:{},{},{},{}'.format(line_number, 0, - branch_index, 0) + print('BRDA:{},{},{},{}'.format(line_number, 0, + branch_index, 0)) branch_index += 1 - print 'DA:{},{}'.format(line_number, line_hits) + print('DA:{},{}'.format(line_number, 
line_hits)) - print 'end_of_record' + print('end_of_record') diff --git a/tools/enforce/count_all_enforce.sh b/tools/enforce/count_all_enforce.sh index c1b7508de0361b7a9036557f88fd0b10f326dcc6..683b73614d29bb42871c63dc94d365626d0375ad 100644 --- a/tools/enforce/count_all_enforce.sh +++ b/tools/enforce/count_all_enforce.sh @@ -39,7 +39,7 @@ # Valid PADDLE_ENFORCE{_**} & PADDLE_THROW Count: 1706 # Invalid PADDLE_ENFORCE{_**} & PADDLE_THROW Count: 4572 -ROOT_DIR=../paddle/fluid +ROOT_DIR=../../paddle/fluid ALL_PADDLE_CHECK_CNT=0 VALID_PADDLE_CHECK_CNT=0 diff --git a/tools/enforce/count_enforce_by_dir.sh b/tools/enforce/count_enforce_by_dir.sh index 03233d417ac88eef775e1ca6a77d0600a4faa361..3cb13edf7cc27f6a0de45080a7b90e4b4e24b6b5 100644 --- a/tools/enforce/count_enforce_by_dir.sh +++ b/tools/enforce/count_enforce_by_dir.sh @@ -59,7 +59,7 @@ . ./count_all_enforce.sh --source-only -ROOT_DIR=../paddle/fluid +ROOT_DIR=../../paddle/fluid function count_dir_independently(){ local sub_dir_total_check_cnt=0 diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat new file mode 100644 index 0000000000000000000000000000000000000000..16665ac4aafddca323c2f453f5fcdd78aa0949ed --- /dev/null +++ b/tools/windows/build_compile_environment.bat @@ -0,0 +1,190 @@ +:: Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +:: +:: Licensed under the Apache License, Version 2.0 (the "License"); +:: you may not use this file except in compliance with the License. +:: You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, software +:: distributed under the License is distributed on an "AS IS" BASIS, +:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +:: See the License for the specific language governing permissions and +:: limitations under the License. +:: +:: ================================== +:: Build Paddle compile environment +:: ================================== +:: Description: +:: +:: Install compile environment for xly CI. +:: +:: Include: +:: 1. CMake 3.17.0 +:: 2. Git 2.28.0 +:: 3. Python 3.7.8 +:: 4. Visual Studio 2015 with update 3 +:: 5. CUDA 10 [missing cudnn] +:: 6. java jre [not complete] +:: 7. xly agent [not complete] + +:: Echo command is not required. +@echo off + +:: ===== start step 0: wget tool ===== +:: Download wget for windows when the wget tool is not present. +echo ">>>>>>>> step [0/7]: wget tool" +wget --help > nul 2> nul || call:install_wget +goto cmake + +:install_wget +echo There is no wget on this PC, will download wget 1.20. +echo Download package from https://eternallybored.org/misc/wget/1.20/64/wget.exe ... +certutil -urlcache -split -f https://eternallybored.org/misc/wget/1.20/64/wget.exe > nul 2> nul +if %errorlevel% == 0 ( + echo Downloaded wget tool into %cd% successfully. +) else ( + echo Error***** Download wget tool failed, please download it before rerun. + exit /b 1 +) +goto :eof +:: ===== end step 0: wget tool ===== + +:: ===== start step 1: cmake ===== +:: Download CMake-3.17.0 and add it to PATH when it is not installed. +:: TODO: limit version >= 3.17.0 +:cmake
echo ">>>>>>>> step [1/7]: CMake 3.17.0" +cmake --help > nul 2> nul || call :install_cmake +goto git + +:install_cmake +echo There is no cmake on this PC, will install cmake-3.17.0. +echo Download package from https://cmake.org/files/v3.17/cmake-3.17.0-win64-x64.msi ...
+wget -O cmake-3.17.0-win64-x64.msi https://cmake.org/files/v3.17/cmake-3.17.0-win64-x64.msi +echo Install cmake-3.17.0 ... +:: /passive [silent installation] +:: /norestart [do not restart] +:: ADD_CMAKE_TO_PATH = System [add CMake to the system PATH for all users] +start /wait cmake-3.17.0-win64-x64.msi /passive /norestart ADD_CMAKE_TO_PATH=System +if %errorlevel% == 0 ( + echo Install CMake-3.17.0 success! +) else ( + echo Error***** Install Cmake-3.17.0 failed, please re-install it manually. +) +del cmake-3.17.0-win64-x64.msi +goto :eof +:: ===== end step 1: cmake ===== + +:: ===== start step 2: Git ===== +:: Download Git-2.28.0 and add it to PATH when it is not installed. +:: TODO: limit version >= 2.28.0 +:git +echo ">>>>>>>> step [2/7]: Git 2.28.0" +git --help > nul 2> nul || call :install_git +goto python + +:install_git +echo There is no git on this PC, will install Git-2.28.0. +echo Download package from https://github.com/git-for-windows/git/releases/download/v2.28.0.windows.1/Git-2.28.0-64-bit.exe ... +wget -O Git-2.28.0-64-bit.exe https://github.com/git-for-windows/git/releases/download/v2.28.0.windows.1/Git-2.28.0-64-bit.exe +echo Install Git-2.28.0 ... +:: /SILENT [silent install] +:: /ALLUSERS [add path for all users] +:: /NORESTART [do not restart] +start /wait Git-2.28.0-64-bit.exe /SILENT /ALLUSERS /NORESTART +if %errorlevel% == 0 ( + echo Install Git-2.28.0 success! +) else ( + echo Error***** Install Git-2.28.0 failed, please re-install it manually. +) +del Git-2.28.0-64-bit.exe +goto :eof +:: ===== end step 2: Git ===== + +:: ===== start step 3: Python ===== +:: Download Python-3.7.8 and add it to PATH when it is not installed. +:: TODO: limit version >= 3.7.8 +:python +echo ">>>>>>>> step [3/7]: Python 3.7.8" +python -V 2>&1 | findstr /C:"Python 3.7.8" > nul 2> nul || call :install_python +goto vs2015 + +:install_python +echo There is no Python on this PC, will install Python-3.7.8. +echo Download package from https://npm.taobao.org/mirrors/python/3.7.8/python-3.7.8-amd64.exe ... +wget -O python-3.7.8-amd64.exe https://npm.taobao.org/mirrors/python/3.7.8/python-3.7.8-amd64.exe +echo Install Python-3.7.8 ... +:: /passive [silent install] +:: InstallAllUsers [add path for all users] +:: PrependPath [add script/install into PATH] +:: TargetDir [install directory] +start /wait python-3.7.8-amd64.exe /passive InstallAllUsers=1 PrependPath=1 TargetDir=C:\Python37 +if %errorlevel% == 0 ( + echo Install python-3.7.8 success! +) else ( + echo Error***** Install python-3.7.8 failed, please re-install it manually. +) +del python-3.7.8-amd64.exe +goto :eof +:: ===== end step 3: Python ===== + +:: ===== start step 4: Visual Studio 2015 ===== +:: Download Visual Studio 2015 when it is not installed. +:vs2015 +echo ">>>>>>>> step [4/7]: Visual Studio 2015" +cmd /C "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 > nul 2> nul || call :install_visual_studio +goto :cuda10 + +:install_visual_studio +echo There is no Visual Studio on this PC, will install VS2015. +echo Download package from "https://download.my.visualstudio.com/pr/en_visual_studio_professional_2015_with_update_3_x86_x64_web_installer_8922978.exe" +wget -O vs_installer.exe "https://download.my.visualstudio.com/pr/en_visual_studio_professional_2015_with_update_3_x86_x64_web_installer_8922978.exe?t=9ee7a96d-ca80-4b84-af2c-7dd86996a0aa&e=1600103404&h=3cdea1e81c04aa4e846f5314972c46eb&su=1" +echo Install Visual Studio 2015 ...
+:: /passive [silent install] +:: /norestart [no restart] +:: /NoRefresh [no refresh] +:: /InstallSelectableItems NativeLanguageSupport_Group [select Visual C++ for installing] +start /wait vs_installer.exe /passive /norestart /NoRefresh /InstallSelectableItems NativeLanguageSupport_Group +if %errorlevel% == 0 ( + echo Install Visual Studio 2015 success! +) else ( + echo Error***** Install Visual Studio 2015 failed, please re-install it manually. +) +del vs_installer.exe +goto :eof +:: ===== end step 4: Visual Studio 2015 ===== + +:: ===== start step 5: CUDA 10 ===== +:cuda10 +echo ">>>>>>>> step [5/7]: CUDA 10.0" +nvcc --version > nul 2> nul || call :install_cuda +goto java-jre + +:install_cuda +echo There is no CUDA on this PC, will install CUDA-10.0. +echo Download package from "https://developer.download.nvidia.cn/compute/cuda/10.0/secure/Prod/network_installers/cuda_10.0.130_win10_network.exe" +wget -O cuda_installer.exe "https://developer.download.nvidia.cn/compute/cuda/10.0/secure/Prod/network_installers/cuda_10.0.130_win10_network.exe?hG7oBtA2CnxZG7d39onmBdtzrIa2cOukrmW8I0qk3h36vb2Sj0yYGjMElJlxlNhjx8Xu5RlbmdBhCWvP2QcEqMjCoKCXe5lOgr5uIIso_7LqrotgQHbZRZSVBYRT4bIAHPVSPrr4_4KczKvI9Nf3mbO9RJ2Vj6ECD5QphRMJBus0KKNVxO1gsplVL5qaCnE" +echo Install CUDA-10.0 ... +:: -s [silent install] +start /wait cuda_installer.exe -s +if %errorlevel% == 0 ( + echo Install CUDA-10.0 success! +) else ( + echo Error***** Install CUDA-10.0 failed, please re-install it manually. +) +del cuda_installer.exe +goto :eof +:: ===== end step 5: CUDA 10 ===== + +:: ===== start step 6: java jre ===== +:java-jre +echo ">>>>>>>> step [6/7]: java jre" +goto xly-agent +:: ===== end step 6: java jre ===== + +:: ===== start step 7: xly agent ===== +:xly-agent +echo ">>>>>>>> step [7/7]: xly agent" +goto :eof +:: ===== end step 7: xly agent ===== \ No newline at end of file
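As a closing aid, a standalone sketch (not part of the patch; normalize_dim is a hypothetical helper name) of the negative-dim normalization added in python/paddle/nn/utils/weight_norm_hook.py above:

    def normalize_dim(dim, weight_rank):
        # None is mapped to the sentinel -1, which is handled specially downstream
        if dim is None:
            dim = -1
        assert -weight_rank <= dim < weight_rank, \
            "dim must be set within [-R, R), where R means the dimension of weight."
        if dim != -1:
            # map a negative index into [0, weight_rank), as the patch does
            dim = (dim + weight_rank) % weight_rank
        return dim

    print(normalize_dim(-2, 4))    # 2
    print(normalize_dim(None, 4))  # -1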