Commit 83c2d35a authored by seiriosPlus

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize/large_scale_kv_spped

@@ -16,7 +16,7 @@ else()
   set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
   set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
   set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75")
-  set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80")
+  set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
 endif()
 ######################################################################################
...
@@ -19,7 +19,7 @@ SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc")
 SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc")
 SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE)
 SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE)
-SET(DGC_URL "http://fleet.bj.bcebos.com/collective_ef2216a.tgz")
+SET(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz")
 INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR})
 cache_third_party(extern_dgc
@@ -30,7 +30,7 @@ ExternalProject_Add(
     extern_dgc
     ${EXTERNAL_PROJECT_LOG_ARGS}
     "${DGC_DOWNLOAD_CMD}"
-    URL_MD5 "2f67549fd5f1262383d83289abc4f88f"
+    URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251"
     PREFIX "${DGC_PREFIX_DIR}"
     SOURCE_DIR "${DGC_SOURCES_DIR}"
     CONFIGURE_COMMAND ""
...
@@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
     set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
     if(NOT LITE_GIT_TAG)
-        set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa)
+        set(LITE_GIT_TAG 6d2b2a4028a58715b01887b04eb9bff8432eb184)
     endif()
     if(NOT CUDA_ARCH_NAME)
...
@@ -19,8 +19,8 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn)
 SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
 SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
-SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git)
-SET(MKLDNN_TAG 1ea812f4f5aa1bd989372a23ab50d0f0f81ee677)
+SET(MKLDNN_REPOSITORY https://github.com/oneapi-src/oneDNN.git)
+SET(MKLDNN_TAG 64a48f9565aa72f6359917b3406328075a409939)
 # Introduce variables:
 # * CMAKE_INSTALL_LIBDIR
...
@@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e)
+set(WARPCTC_TAG fc7f226b93758216a03b1be9d24593a12819b984)
 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
...
@@ -28,7 +28,15 @@ function(CheckCompilerCXX11Flag)
 endfunction()
 CheckCompilerCXX11Flag()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+if (WITH_GPU)
+  if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+  else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+  endif()
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+endif()
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
...
@@ -243,9 +243,10 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
 ENDIF()
 if(WITH_GPU)
-    include(external/cub)  # download cub
-    list(APPEND third_party_deps extern_cub)
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+        include(external/cub)  # download cub
+        list(APPEND third_party_deps extern_cub)
+    endif()
     set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
     file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
 endif(WITH_GPU)
...
@@ -49,7 +49,8 @@ std::vector<std::string> PD_GetGradOpDescStrs(
     for (size_t i = 0; i < op_num; ++i) {
       PADDLE_ENFORCE_EQ(
           grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true,
-          "Cannot serialize message.");
+          paddle::platform::errors::Unavailable(
+              "Cannot serialize operator desc message."));
     }
   }
   return ret;
...
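Note: most of the C++ changes in this merge follow a single pattern: bare PADDLE_ENFORCE(...) and PADDLE_THROW("...") calls are rewritten to the typed macros (PADDLE_ENFORCE_EQ / _NE / _GT / _NOT_NULL / _CUDA_SUCCESS) with an explicit platform::errors::* category. The sketch below only illustrates that style; it is not code from this commit, the function and its arguments are hypothetical, and it assumes the usual paddle/fluid/platform/enforce.h header.

#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace framework {

// Hypothetical helper, shown only to illustrate the enforcement style used
// throughout this merge: typed comparison macros plus an error category,
// instead of PADDLE_ENFORCE(pool != nullptr, "...").
void CheckNumThreads(int num_threads, const void* pool) {
  PADDLE_ENFORCE_NOT_NULL(
      pool, platform::errors::InvalidArgument(
                "The thread pool pointer should not be nullptr."));
  PADDLE_ENFORCE_GT(num_threads, 0,
                    platform::errors::InvalidArgument(
                        "The number of threads must be positive, but got %d.",
                        num_threads));
}

}  // namespace framework
}  // namespace paddle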
@@ -36,7 +36,10 @@ message AMPConfig {
   repeated string custom_black_varnames = 9;
 }
-message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; }
+message LocalSGDConfig {
+  optional int32 k_steps = 1 [ default = 1 ];
+  optional int32 begin_step = 2 [ default = 1 ];
+}
 message GradientMergeConfig {
   optional int32 k_steps = 1 [ default = 1 ];
@@ -52,6 +55,8 @@ message DGCConfig {
 message LarsConfig {
   optional float lars_coeff = 1 [ default = 0.001 ];
   optional float lars_weight_decay = 2 [ default = 0.0005 ];
+  optional float epsilon = 3 [ default = 0.0 ];
+  repeated string exclude_from_weight_decay = 4;
 }
 message LambConfig {
...
@@ -25,7 +25,7 @@ bool NCCLWrapper::is_initialized_ = false;
 void NCCLWrapper::InitNCCL() {
 #if defined(PADDLE_WITH_NCCL)
-  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
       &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
       nccl_info_.my_global_rank_));
 #endif
@@ -41,7 +41,8 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
 NCCLInfo NCCLWrapper::GetNCCLId() {
 #if defined(PADDLE_WITH_NCCL)
-  PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
 #endif
   return nccl_info_;
 }
@@ -52,8 +53,8 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
   nccl_info_.local_rank_ = local_rank;
   nccl_info_.my_global_rank_ = global_rank;
   nccl_info_.global_ranks_ = ranks;
-  PADDLE_ENFORCE(cudaSetDevice(local_rank));
-  PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_)));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(local_rank));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_)));
 #endif
   return;
 }
@@ -65,7 +66,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
     auto var = scope.FindVar(name);
     LoDTensor* tensor = var->GetMutable<LoDTensor>();
     int32_t total_size = tensor->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclBcast(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
         reinterpret_cast<void*>(tensor->data<float>()), total_size, ncclFloat,
         root_rank, nccl_info_.comm_, nccl_info_.stream_));
     cudaStreamSynchronize(nccl_info_.stream_);
...
@@ -42,7 +42,8 @@ void ThreadPool::Init() {
       num_threads = FLAGS_dist_threadpool_size;
       VLOG(1) << "set dist_threadpool_size to " << num_threads;
     }
-    PADDLE_ENFORCE_GT(num_threads, 0);
+    PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument(
+                                          "The number of threads is 0."));
     threadpool_.reset(new ThreadPool(num_threads));
   }
 }
@@ -83,7 +84,8 @@ void ThreadPool::TaskLoop() {
     }
     if (tasks_.empty()) {
-      PADDLE_THROW("This thread has no task to Run");
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Current thread has no task to Run."));
     }
     // pop a task from the task queue
...
@@ -91,7 +91,8 @@ class ThreadPool {
     {
       std::unique_lock<std::mutex> lock(mutex_);
       if (!running_) {
-        PADDLE_THROW("enqueue on stopped ThreadPool");
+        PADDLE_THROW(platform::errors::Unavailable(
+            "Task is enqueued into stopped ThreadPool."));
       }
       tasks_.push(std::move(task));
     }
...
@@ -43,8 +43,9 @@ void VarDesc::SetTensorDescNum(size_t num) {
     } break;
     default:
       PADDLE_THROW(
-          "Setting 'sub_tensor_number' is not supported by the type of var %s.",
-          this->Name());
+          platform::errors::Unavailable("Setting 'sub_tensor_number' is not "
+                                        "supported by the %s type variable.",
+                                        this->Name()));
   }
 }
@@ -55,8 +56,9 @@ size_t VarDesc::GetTensorDescNum() const {
       break;
     default:
       PADDLE_THROW(
-          "Getting 'sub_tensor_number' is not supported by the type of var %s.",
-          this->Name());
+          platform::errors::Unavailable("Getting 'sub_tensor_number' is not "
+                                        "supported by the %s type variable.",
+                                        this->Name()));
   }
 }
@@ -133,9 +135,9 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
       desc_.mutable_type()->mutable_tensor_array()->set_lod_level(lod_level);
       break;
     default:
-      PADDLE_THROW(
-          "Setting 'lod_level' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Setting 'lod_level' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
@@ -157,9 +159,9 @@ void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
       }
     } break;
     default:
-      PADDLE_THROW(
-          "Setting 'lod_levels' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Setting 'lod_levels' is not supported by the %s type variable",
+          this->Name()));
   }
 }
@@ -170,9 +172,9 @@ int32_t VarDesc::GetLoDLevel() const {
     case proto::VarType::LOD_TENSOR_ARRAY:
       return desc_.type().tensor_array().lod_level();
     default:
-      PADDLE_THROW(
-          "Getting 'lod_level' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'lod_level' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
@@ -187,15 +189,19 @@ std::vector<int32_t> VarDesc::GetLoDLevels() const {
       return res;
       break;
     default:
-      PADDLE_THROW(
-          "Getting 'lod_levels' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'lod_levels' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
+  PADDLE_ENFORCE_EQ(
+      desc_.type().has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   switch (desc_.type().type()) {
     case proto::VarType::SELECTED_ROWS:
       return desc_.type().selected_rows();
@@ -204,14 +210,16 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
     case proto::VarType::LOD_TENSOR_ARRAY:
       return desc_.type().tensor_array().tensor();
     default:
-      PADDLE_THROW(
-          "Getting 'tensor_desc' is not supported by the type of var %s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'tensor_desc' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   std::vector<proto::VarType::TensorDesc> res;
   res.reserve(GetTensorDescNum());
   switch (desc_.type().type()) {
@@ -221,16 +229,19 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
       }
       return res;
     default:
-      PADDLE_THROW(
-          "Getting 'tensor_descs' is not supported by the type of var "
-          "%s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'tensor_descs' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
 proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
+  PADDLE_ENFORCE_EQ(
+      desc_.type().has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   switch (desc_.type().type()) {
     case proto::VarType::SELECTED_ROWS:
       return desc_.mutable_type()->mutable_selected_rows();
@@ -240,15 +251,19 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
       return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor();
     default:
       PADDLE_THROW(
-          "Getting 'mutable_tensor_desc' is not supported by the type of var "
-          "%s.",
-          this->Name());
+          platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not "
+                                        "supported by the %s type variable.",
+                                        this->Name()));
   }
 }
 std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
+  PADDLE_ENFORCE_EQ(
+      desc_.has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
+  PADDLE_ENFORCE_EQ(
+      desc_.type().has_type(), true,
+      platform::errors::NotFound("The variable's type was not be set."));
   std::vector<proto::VarType::TensorDesc *> res;
   res.reserve(GetTensorDescNum());
   switch (desc_.type().type()) {
@@ -259,10 +274,9 @@ std::vector<proto::VarType::TensorDesc *> VarDesc::mutable_tensor_descs() {
     }
       return res;
     default:
-      PADDLE_THROW(
-          "Getting 'tensor_descs' is not supported by the type of var "
-          "%s.",
-          this->Name());
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Getting 'tensor_descs' is not supported by the %s type variable.",
+          this->Name()));
   }
 }
...
@@ -40,7 +40,8 @@ inline proto::VarType::Type ToVarType(int type) {
     case proto::VarType::READER:
      return static_cast<proto::VarType::Type>(type);
     default:
-      PADDLE_THROW("ToVarType:Unsupported type %d", type);
+      PADDLE_THROW(platform::errors::Unavailable(
+          "ToVarType method Unsupported type %d.", type));
   }
 }
@@ -66,7 +67,8 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
       visitor(var.Get<FetchList>());
       return;
     default:
-      PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type()));
+      PADDLE_THROW(platform::errors::Unavailable("Not supported visit type %s.",
+                                                 ToTypeName(var.Type())));
   }
 }
...
@@ -46,12 +46,14 @@ struct VarIdToTypeIndexMapInitializerImpl {
     static_assert(!std::is_same<Type, void>::value, "Type cannot be void");
     constexpr int kId = VarTypeTrait<Type>::kId;
     auto type = std::type_index(typeid(Type));
-    PADDLE_ENFORCE(id_to_type->count(kId) == 0,
-                   "Registered duplicate type id %d for type %s", kId,
-                   type.name());
-    PADDLE_ENFORCE(type_to_id->count(type) == 0,
-                   "Registered duplicate type_index %s for id %d", type.name(),
-                   kId);
+    PADDLE_ENFORCE_EQ(
+        id_to_type->count(kId), 0,
+        platform::errors::AlreadyExists(
+            "Registered duplicate type id %d for type %s.", kId, type.name()));
+    PADDLE_ENFORCE_EQ(
+        type_to_id->count(type), 0,
+        platform::errors::AlreadyExists(
+            "Registered duplicate type index %s for id %d.", type.name(), kId));
     id_to_type->emplace(kId, type);
     type_to_id->emplace(type, kId);
     VarIdToTypeIndexMapInitializerImpl<kStart + 1, kEnd,
@@ -79,15 +81,17 @@ struct VarIdToTypeIndexMapHolder {
  public:
   static const std::type_index &ToTypeIndex(int var_id) {
     auto it = Instance().id_to_type_map_.find(var_id);
-    PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(),
-                   "VarId %d is not registered.", var_id);
+    PADDLE_ENFORCE_NE(it, Instance().id_to_type_map_.end(),
+                      platform::errors::NotFound(
+                          "Variable Id %d is not registered.", var_id));
     return it->second;
   }
   static int ToTypeId(const std::type_index &type) {
     auto it = Instance().type_to_id_map_.find(type);
-    PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(),
-                   "VarType %s is not registered.", type.name());
+    PADDLE_ENFORCE_NE(it, Instance().type_to_id_map_.end(),
+                      platform::errors::NotFound(
+                          "Variable Type %s is not registered.", type.name()));
     return it->second;
   }
...
@@ -50,11 +50,11 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
   } else if (var_type == proto::VarType::RAW) {
     // GetMutable will be called in operator
   } else {
-    PADDLE_THROW(
+    PADDLE_THROW(platform::errors::Unavailable(
         "Variable type %d is not in "
         "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
-        var_type);
+        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW].",
+        var_type));
   }
 }
@@ -76,7 +76,8 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) {
     auto *dst_t = tmp_grad_slr->mutable_value();
     framework::TensorCopy(src_t, cpu_place, dst_t);
   } else {
-    PADDLE_THROW("unknown var type to copy");
+    PADDLE_THROW(
+        platform::errors::Unavailable("Unknown variable type to copy."));
   }
 }
...
@@ -218,6 +218,10 @@ struct Argument {
   DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t);
+  // Only used in paddle-lite subgraph.
+  DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads,
+                      int);
  private:
   std::unordered_set<std::string> valid_fields_;
 };
...
@@ -150,6 +150,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("use_xpu", new bool(argument->use_xpu()));
       pass->Set("xpu_l3_workspace_size",
                 new int(argument->xpu_l3_workspace_size()));
+      pass->Set("cpu_math_library_num_threads",
+                new int(argument->cpu_math_library_num_threads()));
     }
     disable_logs_ = argument->disable_logs();
     if (pass_name == "fc_fuse_pass") {
...
@@ -244,6 +244,7 @@ void LiteSubgraphPass::SetUpEngine(
   bool enable_int8 = Get<bool>("enable_int8");
   bool use_xpu = Get<bool>("use_xpu");
   int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
+  int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
   lite_api::TargetType target_type;
   if (use_gpu) {
@@ -263,11 +264,12 @@ void LiteSubgraphPass::SetUpEngine(
       // Notice: The ordering here determines the device where the
       // input tensor of the Lite engine is located, and then affects
       // whether tensor sharing is feasible.
-      paddle::lite::Place({target_type, precision_type}),
-      paddle::lite::Place({target_type, PRECISION(kInt64)}),
-      paddle::lite::Place({target_type, PRECISION(kFloat)}),
-      paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}),
+      paddle::lite_api::Place({target_type, precision_type}),
+      paddle::lite_api::Place({target_type, PRECISION(kInt64)}),
+      paddle::lite_api::Place({target_type, PRECISION(kFloat)}),
+      paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}),
   };
+  config.cpu_math_library_num_threads = cpu_math_library_num_threads;
   config.xpu_l3_workspace_size = xpu_l3_workspace_size;
   if (dump_model) {
     lite::StrToBinaryFile("./model.bin", config.model);
...
@@ -53,12 +53,10 @@ if(WITH_TESTING)
     inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_fluid_shared
       ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
     set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
-    set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
   elseif(WIN32)
     inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
       ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
     set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
-    set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST")
   endif()
 endif()
...
@@ -461,6 +461,8 @@ void AnalysisPredictor::PrepareArgument() {
   }
   if (config_.lite_engine_enabled()) {
+    argument_.SetCpuMathLibraryNumThreads(
+        config_.cpu_math_library_num_threads());
     argument_.SetLitePrecisionMode(config_.lite_precision_mode_);
     argument_.SetLitePassesFilter(config_.lite_passes_filter_);
     argument_.SetLiteOpsFilter(config_.lite_ops_filter_);
...
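Note: the hunks above (Argument, IRPassManager, LiteSubgraphPass, AnalysisPredictor) plumb a cpu_math_library_num_threads value from the user-facing AnalysisConfig into the Lite subgraph engine config. A hedged sketch of the user-side setup follows; the model path and thread count are placeholders, and the no-argument EnableLiteEngine() call relies on that method's default parameters.

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

// Hypothetical helper: builds a config whose math-library thread count is now
// forwarded to the Lite subgraph engine by the changes above.
paddle::AnalysisConfig MakeLiteConfig() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/inference_model");  // placeholder directory
  config.SetCpuMathLibraryNumThreads(4);        // placeholder thread count
  config.EnableLiteEngine();                    // Lite subgraph engine, default arguments
  return config;
}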
@@ -21,15 +21,21 @@
 namespace paddle {
 void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
-  PADDLE_ENFORCE(!name_.empty(),
-                 "Need to SetName first, so that the corresponding tensor can "
-                 "be retrieved.");
-  PADDLE_ENFORCE(input_or_output_,
-                 "Can't reshape the output tensor, it is readonly");
-  PADDLE_ENFORCE(scope_);
+  PADDLE_ENFORCE_EQ(
+      name_.empty(), false,
+      platform::errors::PreconditionNotMet(
+          "Need to SetName first, so that the corresponding tensor can "
+          "be retrieved."));
+  PADDLE_ENFORCE_EQ(input_or_output_, true,
+                    platform::errors::PermissionDenied(
+                        "Can't reshape the output tensor, it is readonly"));
+  PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
+                                      "The scope should not be nullptr."));
   auto *scope = static_cast<framework::Scope *>(scope_);
   auto *var = scope->FindVar(name_);
-  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::PreconditionNotMet(
+               "No tensor called [%s] in the runtime scope", name_));
   auto *tensor = var->GetMutable<framework::LoDTensor>();
   tensor->Resize(framework::make_ddim(shape));
 }
@@ -45,8 +51,10 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
   EAGER_GET_TENSOR;
   PADDLE_ENFORCE_GT(
       tensor->numel(), 0,
-      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
-      "function before retrieving mutable_data from input tensor.");
+      platform::errors::PreconditionNotMet(
+          "You should call ZeroCopyTensor::Reshape(const std::vector<int> "
+          "&shape)"
+          "function before retrieving mutable_data from input tensor."));
   switch (static_cast<int>(place)) {
     case static_cast<int>(PaddlePlace::kCPU): {
       return tensor->mutable_data<T>(platform::CPUPlace());
@@ -55,7 +63,8 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
       return tensor->mutable_data<T>(platform::CUDAPlace(device_));
     }
     default:
-      PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
+      PADDLE_THROW(platform::errors::Unavailable("Unsupported place: %d",
+                                                 static_cast<int>(place)));
       break;
   }
   return nullptr;
@@ -96,10 +105,11 @@ PaddleDType ZeroCopyTensor::type() const {
 template <typename T>
 void ZeroCopyTensor::copy_from_cpu(const T *data) {
   EAGER_GET_TENSOR;
-  PADDLE_ENFORCE_GE(
-      tensor->numel(), 0,
-      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
-      "function before copying data from cpu.");
+  PADDLE_ENFORCE_GE(tensor->numel(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "You should call ZeroCopyTensor::Reshape(const "
+                        "std::vector<int> &shape)"
+                        "function before copying data from cpu."));
   size_t ele_size = tensor->numel() * sizeof(T);
   if (place_ == PaddlePlace::kCPU) {
@@ -116,7 +126,8 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) {
     memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
                  data, ele_size, dev_ctx->stream());
 #else
-    PADDLE_THROW("Not compiled with CUDA, should not reach here.");
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compiled with CUDA, should not reach here."));
 #endif
   }
 }
@@ -141,7 +152,8 @@ void ZeroCopyTensor::copy_to_cpu(T *data) {
     cudaStreamSynchronize(dev_ctx->stream());
 #else
-    PADDLE_THROW("Not compile with CUDA, should not reach here.");
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compile with CUDA, should not reach here."));
 #endif
   }
 }
@@ -176,20 +188,27 @@ template PD_INFER_DECL uint8_t *ZeroCopyTensor::mutable_data<uint8_t>(
     PaddlePlace place);
 void *ZeroCopyTensor::FindTensor() const {
-  PADDLE_ENFORCE(!name_.empty(),
-                 "Need to SetName first, so that the corresponding tensor can "
-                 "be retrieved.");
-  PADDLE_ENFORCE(scope_);
+  PADDLE_ENFORCE_EQ(
+      name_.empty(), false,
+      platform::errors::PreconditionNotMet(
+          "Need to SetName first, so that the corresponding tensor can "
+          "be retrieved."));
+  PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet(
+                                      "The scope should not be nullptr."));
   auto *scope = static_cast<framework::Scope *>(scope_);
   auto *var = scope->FindVar(name_);
-  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::PreconditionNotMet(
+               "No tensor called [%s] in the runtime scope", name_));
   auto *tensor = var->GetMutable<framework::LoDTensor>();
   return tensor;
 }
 std::vector<int> ZeroCopyTensor::shape() const {
   EAGER_GET_TENSOR;
-  PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
+  PADDLE_ENFORCE_NOT_NULL(
+      tensor_, platform::errors::PreconditionNotMet(
+                   "Not found tensor called %s in the scope", name_));
   return framework::vectorize<int>(tensor->dims());
 }
...
@@ -31,12 +31,30 @@ limitations under the License. */
 #include "paddle_analysis_config.h"  // NOLINT
 #include "paddle_api.h"              // NOLINT
+///
+/// \file paddle_inference_api.h
+///
+/// \brief Paddle Inference API
+///
+/// \author paddle-infer@baidu.com
+/// \date 2020-09-01
+/// \since 2.0.0-beta
+///
 namespace paddle_infer {
 using DataType = paddle::PaddleDType;
 using PlaceType = paddle::PaddlePlace;
 using PrecisionType = paddle::AnalysisConfig::Precision;
 using Config = paddle::AnalysisConfig;
+///
+/// \class Tensor
+///
+/// \brief Represents an n-dimensional array of values.
+/// The Tensor is used to store the input or output of the network.
+/// It is obtained through Predictor::GetInputHandle()
+/// and Predictor::GetOutputHandle() interface.
+///
 class PD_INFER_DECL Tensor {
  public:
   // Can only be created by predictor->GetInputHandle(const std::string& name)
@@ -44,33 +62,106 @@ class PD_INFER_DECL Tensor {
   Tensor() = delete;
   explicit Tensor(std::unique_ptr<paddle::ZeroCopyTensor>&& tensor)
       : tensor_(std::move(tensor)) {}
+  ///
+  /// \brief Reset the shape of the tensor.
+  /// Generally it's only used for the input tensor.
+  /// Reshape must be called before calling mutable_data() or CopyFromCpu()
+  /// \param shape The shape to set.
+  ///
   void Reshape(const std::vector<int>& shape);
+  ///
+  /// \brief Copy the host memory to tensor data.
+  /// It's usually used to set the input tensor data.
+  /// \param data The pointer of the data, from which the tensor will copy.
+  ///
   template <typename T>
   void CopyFromCpu(const T* data);
-  // should add the place
+  ///
+  /// \brief Get the memory pointer in CPU or GPU with specific data type.
+  /// Please Reshape the tensor first before call this.
+  /// It's usually used to get input data pointer.
+  /// \param place The place of the tensor.
+  /// \return The tensor data buffer pointer.
+  ///
   template <typename T>
   T* mutable_data(PlaceType place);
+  ///
+  /// \brief Copy the tensor data to the host memory.
+  /// It's usually used to get the output tensor data.
+  /// \param[out] data The tensor will copy the data to the address.
+  ///
   template <typename T>
   void CopyToCpu(T* data);
+  ///
+  /// \brief Get the memory pointer directly.
+  /// It's usually used to get the output data pointer.
+  /// \param[out] place To get the device type of the tensor.
+  /// \param[out] size To get the data size of the tensor.
+  /// \return The tensor data buffer pointer.
+  ///
   template <typename T>
   T* data(PlaceType* place, int* size) const;
+  ///
+  /// \brief Set lod info of the tensor.
+  /// More about LOD can be seen here:
+  /// https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
+  /// \param x the lod info.
+  ///
   void SetLoD(const std::vector<std::vector<size_t>>& x);
+  /// \brief Return the lod info of the tensor.
   std::vector<std::vector<size_t>> lod() const;
+  /// \brief Return the data type of the tensor.
+  /// It's usually used to get the output tensor data type.
+  /// \return The data type of the tensor.
   DataType type() const;
+  /// \brief Return the shape of the Tensor.
   std::vector<int> shape() const;
+  /// \brief Return the name of the tensor.
   const std::string& name() const;
  private:
   std::unique_ptr<paddle::ZeroCopyTensor> tensor_;
 };
+///
+/// \class Predictor
+///
+/// \brief Predictor is the interface for model prediction.
+///
+/// The predictor has the following typical uses:
+///
+/// Get predictor
+/// \code{cpp}
+/// auto predictor = CreatePredictor(config);
+/// \endcode
+///
+/// Get input or output names
+/// \code{cpp}
+/// auto input_names = predictor->GetInputNames();
+/// auto output_names = predictor->GetOutputNames();
+/// \endcode
+///
+/// Get input or output handle
+/// \code{cpp}
+/// auto input_t = predictor->GetInputHandle(input_names[0]);
+/// auto output_t = predictor->GetOutputHandle(output_names[0]);
+/// \endcode
+///
+/// Run predictor
+/// \code{cpp}
+/// predictor->Run();
+/// \endcode
+///
 class PD_INFER_DECL Predictor {
  public:
   Predictor() = delete;
@@ -79,25 +170,78 @@ class PD_INFER_DECL Predictor {
   explicit Predictor(std::unique_ptr<paddle::PaddlePredictor>&& pred)
       : predictor_(std::move(pred)) {}
+  ///
+  /// \brief Construct a new Predictor object
+  ///
+  /// \param[in] Config config
+  ///
   explicit Predictor(const Config& config);
+  ///
+  /// \brief Get the input names
+  ///
+  /// \return input names
+  ///
   std::vector<std::string> GetInputNames();
+  ///
+  /// \brief Get the Input Tensor object
+  ///
+  /// \param[in] name input name
+  /// \return input tensor
+  ///
   std::unique_ptr<Tensor> GetInputHandle(const std::string& name);
+  ///
+  /// \brief Run the prediction engine
+  ///
+  /// \return Whether the function executed successfully
+  ///
   bool Run();
+  ///
+  /// \brief Get the output names
+  ///
+  /// \return output names
+  ///
   std::vector<std::string> GetOutputNames();
+  ///
+  /// \brief Get the Output Tensor object
+  ///
+  /// \param[in] name output name
+  /// \return output tensor
+  ///
   std::unique_ptr<Tensor> GetOutputHandle(const std::string& name);
+  ///
+  /// \brief Clone to get the new predictor. thread safe.
+  ///
+  /// \return get a new predictor
+  ///
   std::unique_ptr<Predictor> Clone();
+  /// \brief Clear the intermediate tensors of the predictor
   void ClearIntermediateTensor();
  private:
   std::unique_ptr<paddle::PaddlePredictor> predictor_;
 };
+///
+/// \brief A factory to help create predictors.
+///
+/// Usage:
+///
+/// \code{.cpp}
+/// Config config;
+/// ... // change the configs.
+/// auto predictor = CreatePredictor(config);
+/// \endcode
+///
 PD_INFER_DECL std::shared_ptr<Predictor> CreatePredictor(
     const Config& config);  // NOLINT
 PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype);
 PD_INFER_DECL std::string GetVersion();
@@ -128,13 +272,24 @@ T* Tensor::data(PlaceType* place, int* size) const {
 namespace paddle_infer {
 namespace services {
+///
+/// \class PredictorPool
+///
+/// \brief PredictorPool is a simple encapsulation of Predictor, suitable for
+/// use in multi-threaded situations. According to the thread id, the
+/// corresponding Predictor is taken out from PredictorPool to complete the
+/// prediction.
+///
 class PD_INFER_DECL PredictorPool {
  public:
   PredictorPool() = delete;
   PredictorPool(const PredictorPool&) = delete;
   PredictorPool& operator=(const PredictorPool&) = delete;
+  /// \brief Construct the predictor pool with \param size predictor instances.
   explicit PredictorPool(const Config& config, size_t size = 1);
+  /// \brief Get \param id-th predictor.
   Predictor* Retrive(size_t idx);
  private:
...
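Note: the Doxygen comments added above document the new paddle_infer API (Tensor, Predictor, CreatePredictor, PredictorPool). A minimal end-to-end sketch of the documented flow follows for orientation; the model path and the input shape are placeholders, error handling is omitted, and the sketch is an illustration rather than code from this commit.

#include <functional>
#include <numeric>
#include <vector>

#include "paddle_inference_api.h"  // NOLINT

int main() {
  paddle_infer::Config config;
  config.SetModel("/path/to/model_dir");  // placeholder model directory

  auto predictor = paddle_infer::CreatePredictor(config);

  // Feed one float input; the [1, 3, 224, 224] shape is a placeholder.
  auto input_names = predictor->GetInputNames();
  auto input_t = predictor->GetInputHandle(input_names[0]);
  std::vector<int> shape{1, 3, 224, 224};
  std::vector<float> input(1 * 3 * 224 * 224, 0.f);
  input_t->Reshape(shape);
  input_t->CopyFromCpu(input.data());

  predictor->Run();

  // Copy the first output back to host memory.
  auto output_names = predictor->GetOutputNames();
  auto output_t = predictor->GetOutputHandle(output_names[0]);
  auto out_shape = output_t->shape();
  int out_num = std::accumulate(out_shape.begin(), out_shape.end(), 1,
                                std::multiplies<int>());
  std::vector<float> output(out_num);
  output_t->CopyToCpu(output.data());
  return 0;
}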
@@ -16,6 +16,7 @@
 #include <vector>
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
+#include "paddle/fluid/platform/enforce.h"
 using paddle::ConvertToACPrecision;
 using paddle::ConvertToPaddleDType;
@@ -34,27 +35,37 @@ void PD_DeletePaddleBuf(PD_PaddleBuf* buf) {
 }
 void PD_PaddleBufResize(PD_PaddleBuf* buf, size_t length) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   buf->buf.Resize(length);
 }
 void PD_PaddleBufReset(PD_PaddleBuf* buf, void* data, size_t length) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   buf->buf.Reset(data, length);
 }
 bool PD_PaddleBufEmpty(PD_PaddleBuf* buf) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   return buf->buf.empty();
 }
 void* PD_PaddleBufData(PD_PaddleBuf* buf) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   return buf->buf.data();
 }
 size_t PD_PaddleBufLength(PD_PaddleBuf* buf) {
-  PADDLE_ENFORCE_NOT_NULL(buf);
+  PADDLE_ENFORCE_NOT_NULL(buf,
+                          paddle::platform::errors::InvalidArgument(
+                              "The pointer of Buffer shouldn't be nullptr"));
   return buf->buf.length();
 }
...
@@ -18,7 +18,6 @@
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_api.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
-#include "paddle/fluid/platform/enforce.h"
 using PD_PaddleDType = paddle::PaddleDType;
 using PD_ACPrecision = paddle::AnalysisConfig::Precision;
...
@@ -20,6 +20,7 @@
 #include <vector>
 #include "paddle/fluid/inference/capi/c_api_internal.h"
 #include "paddle/fluid/inference/capi/paddle_c_api.h"
+#include "paddle/fluid/platform/enforce.h"
 using paddle::ConvertToACPrecision;
 using paddle::ConvertToPaddleDType;
@@ -40,7 +41,10 @@ void PD_DeleteAnalysisConfig(PD_AnalysisConfig* config) {
 void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
                  const char* params_path) {
   LOG(INFO) << model_dir;
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   LOG(INFO) << std::string(model_dir);
   if (!params_path) {
     config->config.SetModel(std::string(model_dir));
@@ -50,104 +54,164 @@ void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir,
 }
 void PD_SetProgFile(PD_AnalysisConfig* config, const char* x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetProgFile(std::string(x));
 }
 void PD_SetParamsFile(PD_AnalysisConfig* config, const char* x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetParamsFile(std::string(x));
 }
 void PD_SetOptimCacheDir(PD_AnalysisConfig* config, const char* opt_cache_dir) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetOptimCacheDir(std::string(opt_cache_dir));
 }
 const char* PD_ModelDir(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.model_dir().c_str();
 }
 const char* PD_ProgFile(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.prog_file().c_str();
 }
 const char* PD_ParamsFile(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.params_file().c_str();
 }
 void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb,
                      int device_id) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableUseGpu(static_cast<uint64_t>(memory_pool_init_size_mb),
                               device_id);
 }
 void PD_DisableGpu(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.DisableGpu();
 }
 bool PD_UseGpu(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.use_gpu();
 }
 int PD_GpuDeviceId(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.gpu_device_id();
 }
 int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.memory_pool_init_size_mb();
 }
 float PD_FractionOfGpuMemoryForPool(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.fraction_of_gpu_memory_for_pool();
 }
 void PD_EnableCUDNN(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableCUDNN();
 }
 bool PD_CudnnEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.cudnn_enabled();
 }
 void PD_SwitchIrOptim(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchIrOptim(x);
 }
 bool PD_IrOptim(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.ir_optim();
 }
 void PD_SwitchUseFeedFetchOps(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchUseFeedFetchOps(x);
 }
 bool PD_UseFeedFetchOpsEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.use_feed_fetch_ops_enabled();
 }
 void PD_SwitchSpecifyInputNames(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchSpecifyInputNames(x);
 }
 bool PD_SpecifyInputName(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.specify_input_name();
 }
@@ -155,110 +219,168 @@ void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size,
                              int max_batch_size, int min_subgraph_size,
                              Precision precision, bool use_static,
                              bool use_calib_mode) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableTensorRtEngine(
       workspace_size, max_batch_size, min_subgraph_size,
       paddle::ConvertToACPrecision(precision), use_static, use_calib_mode);
 }
 bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.tensorrt_engine_enabled();
 }
 void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SwitchIrDebug(x);
 }
 void PD_EnableMKLDNN(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableMKLDNN();
 }
 void PD_SetMkldnnCacheCapacity(PD_AnalysisConfig* config, int capacity) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetMkldnnCacheCapacity(capacity);
 }
 bool PD_MkldnnEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.mkldnn_enabled();
 }
 void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config,
                                     int cpu_math_library_num_threads) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.SetCpuMathLibraryNumThreads(cpu_math_library_num_threads);
 }
 int PD_CpuMathLibraryNumThreads(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.cpu_math_library_num_threads();
 }
 void PD_EnableMkldnnQuantizer(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableMkldnnQuantizer();
 }
 bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config);
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   return config->config.mkldnn_quantizer_enabled();
 }
 void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) {
-  PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound(
-                                      "PD_AnalysisConfig should not be null"));
+  PADDLE_ENFORCE_NOT_NULL(
+      config,
+      paddle::platform::errors::InvalidArgument(
+          "The pointer of analysis configuration shouldn't be nullptr"));
   config->config.EnableMkldnnBfloat16();
 }
 bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::NotFound( PADDLE_ENFORCE_NOT_NULL(
"PD_AnalysisConfig should not be null")); config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.mkldnn_bfloat16_enabled(); return config->config.mkldnn_bfloat16_enabled();
} }
void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer, void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer,
size_t prog_buffer_size, const char* params_buffer, size_t prog_buffer_size, const char* params_buffer,
size_t params_buffer_size) { size_t params_buffer_size) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer, config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer,
params_buffer_size); params_buffer_size);
} }
bool PD_ModelFromMemory(const PD_AnalysisConfig* config) { bool PD_ModelFromMemory(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.model_from_memory(); return config->config.model_from_memory();
} }
void PD_EnableMemoryOptim(PD_AnalysisConfig* config) { void PD_EnableMemoryOptim(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableMemoryOptim(); config->config.EnableMemoryOptim();
} }
bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) { bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.enable_memory_optim(); return config->config.enable_memory_optim();
} }
void PD_EnableProfile(PD_AnalysisConfig* config) { void PD_EnableProfile(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.EnableProfile(); config->config.EnableProfile();
} }
bool PD_ProfileEnabled(const PD_AnalysisConfig* config) { bool PD_ProfileEnabled(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.profile_enabled(); return config->config.profile_enabled();
} }
void PD_SetInValid(PD_AnalysisConfig* config) { void PD_SetInValid(PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
config->config.SetInValid(); config->config.SetInValid();
} }
bool PD_IsValid(const PD_AnalysisConfig* config) { bool PD_IsValid(const PD_AnalysisConfig* config) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
return config->config.is_valid(); return config->config.is_valid();
} }
......
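A minimal usage sketch of the null-checked C-API setters above (PD_NewAnalysisConfig and PD_DeleteAnalysisConfig are assumed helpers from paddle_c_api.h, not shown in this hunk); passing a null config to any of these functions now fails with the InvalidArgument message above instead of a bare enforce.
#include "paddle/fluid/inference/capi/paddle_c_api.h"
int main() {
  PD_AnalysisConfig* config = PD_NewAnalysisConfig();  // assumed creation helper
  PD_SwitchIrOptim(config, true);                      // enable IR optimization passes
  PD_EnableMKLDNN(config);                             // effective only in MKL-DNN builds
  PD_SetCpuMathLibraryNumThreads(config, 4);           // CPU math library thread count
  PD_EnableMemoryOptim(config);                        // reuse memory across ops
  PD_DeleteAnalysisConfig(config);                     // assumed destruction helper
  return 0;
}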
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/api/paddle_api.h"
#include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision; using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType; using paddle::ConvertToPaddleDType;
...@@ -81,7 +82,10 @@ extern "C" { ...@@ -81,7 +82,10 @@ extern "C" {
bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
int in_size, PD_Tensor** output_data, int* out_size, int in_size, PD_Tensor** output_data, int* out_size,
int batch_size) { int batch_size) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
VLOG(3) << "Predictor: PD_PredictorRun. "; VLOG(3) << "Predictor: PD_PredictorRun. ";
static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>> static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
predictors; predictors;
...@@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, ...@@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs,
bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
PD_ZeroCopyData* inputs, int in_size, PD_ZeroCopyData* inputs, int in_size,
PD_ZeroCopyData** output, int* out_size) { PD_ZeroCopyData** output, int* out_size) {
PADDLE_ENFORCE_NOT_NULL(config); PADDLE_ENFORCE_NOT_NULL(
config,
paddle::platform::errors::InvalidArgument(
"The pointer of analysis configuration shouldn't be nullptr"));
static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>> static std::map<std::string, std::unique_ptr<paddle::PaddlePredictor>>
predictors; predictors;
if (!predictors.count(config->config.model_dir())) { if (!predictors.count(config->config.model_dir())) {
...@@ -144,7 +151,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, ...@@ -144,7 +151,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
input_t->copy_from_cpu(static_cast<uint8_t*>(inputs[i].data)); input_t->copy_from_cpu(static_cast<uint8_t*>(inputs[i].data));
break; break;
default: default:
CHECK(false) << "Unsupport data type."; PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Unsupported data type."));
break; break;
} }
} }
...@@ -227,7 +235,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor, ...@@ -227,7 +235,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor,
input->copy_from_cpu(static_cast<uint8_t*>(tensor->data.data)); input->copy_from_cpu(static_cast<uint8_t*>(tensor->data.data));
break; break;
default: default:
CHECK(false) << "Unsupport data type."; PADDLE_THROW(
paddle::platform::errors::InvalidArgument("Unsupported data type."));
break; break;
} }
...@@ -294,7 +303,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) { ...@@ -294,7 +303,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) {
output->copy_to_cpu(reinterpret_cast<uint8_t*>(tensor->data.data)); output->copy_to_cpu(reinterpret_cast<uint8_t*>(tensor->data.data));
break; break;
default: default:
CHECK(false) << "Unsupport data type."; PADDLE_THROW(
paddle::platform::errors::InvalidArgument("Unsupported data type."));
break; break;
} }
} }
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <vector> #include <vector>
#include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/c_api_internal.h"
#include "paddle/fluid/inference/capi/paddle_c_api.h" #include "paddle/fluid/inference/capi/paddle_c_api.h"
#include "paddle/fluid/platform/enforce.h"
using paddle::ConvertToACPrecision; using paddle::ConvertToACPrecision;
using paddle::ConvertToPaddleDType; using paddle::ConvertToPaddleDType;
...@@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) { ...@@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) {
} }
void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) { void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.name = std::string(name); tensor->tensor.name = std::string(name);
} }
void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) { void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype); tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype);
} }
void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) { void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.data = buf->buf; tensor->tensor.data = buf->buf;
} }
void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) { void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
tensor->tensor.shape.assign(shape, shape + size); tensor->tensor.shape.assign(shape, shape + size);
} }
const char* PD_GetPaddleTensorName(const PD_Tensor* tensor) { const char* PD_GetPaddleTensorName(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
return tensor->tensor.name.c_str(); return tensor->tensor.name.c_str();
} }
PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) { PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
return ConvertToPDDataType(tensor->tensor.dtype); return ConvertToPDDataType(tensor->tensor.dtype);
} }
PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) { PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
PD_PaddleBuf* ret = PD_NewPaddleBuf(); PD_PaddleBuf* ret = PD_NewPaddleBuf();
ret->buf = tensor->tensor.data; ret->buf = tensor->tensor.data;
return ret; return ret;
} }
const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) { const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) {
PADDLE_ENFORCE_NOT_NULL(tensor); PADDLE_ENFORCE_NOT_NULL(tensor,
paddle::platform::errors::InvalidArgument(
"The pointer of tensor shouldn't be nullptr"));
const std::vector<int>& shape = tensor->tensor.shape; const std::vector<int>& shape = tensor->tensor.shape;
*size = shape.size(); *size = shape.size();
return shape.data(); return shape.data();
......
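A short sketch of the tensor setters and getters above; PD_NewPaddleTensor and the PD_FLOAT32 enum value are assumed to come from paddle_c_api.h and are not part of this hunk.
PD_Tensor* t = PD_NewPaddleTensor();                       // assumed creation helper
int shape[2] = {1, 4};
PD_SetPaddleTensorName(t, const_cast<char*>("input0"));    // stored as std::string
PD_SetPaddleTensorDType(t, PD_FLOAT32);                    // assumed PD_DataType value
PD_SetPaddleTensorShape(t, shape, 2);
int size = 0;
const int* got_shape = PD_GetPaddleTensorShape(t, &size);  // size becomes 2
(void)got_shape;
PD_DeletePaddleTensor(t);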
...@@ -20,8 +20,12 @@ ...@@ -20,8 +20,12 @@
#define LITE_WITH_XPU 1 #define LITE_WITH_XPU 1
#endif #endif
#ifndef PADDLE_WITH_ARM
#define LITE_WITH_X86 1
#endif
#include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/lite/engine.h"
#include "lite/api/paddle_use_passes.h" #include <utility>
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const { ...@@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const {
return engines_.at(name).get() != nullptr; return engines_.at(name).get() != nullptr;
} }
paddle::lite::Predictor* EngineManager::Get(const std::string& name) const { paddle::lite_api::PaddlePredictor* EngineManager::Get(
const std::string& name) const {
return engines_.at(name).get(); return engines_.at(name).get();
} }
paddle::lite::Predictor* EngineManager::Create(const std::string& name, paddle::lite_api::PaddlePredictor* EngineManager::Create(
const EngineConfig& cfg) { const std::string& name, const EngineConfig& cfg) {
if (cfg.valid_places.front().target == TARGET(kCUDA)) { // config info for predictor.
#ifdef PADDLE_WITH_CUDA paddle::lite_api::CxxConfig lite_cxx_config;
paddle::lite::Env<TARGET(kCUDA)>::Init(); lite_cxx_config.set_model_buffer(cfg.model.c_str(), cfg.model.size(),
cfg.param.c_str(), cfg.param.size());
lite_cxx_config.set_valid_places(cfg.valid_places);
#ifdef PADDLE_WITH_ARM
lite_cxx_config.set_threads(cfg.cpu_math_library_num_threads);
#else
lite_cxx_config.set_x86_math_library_num_threads(
cfg.cpu_math_library_num_threads);
#endif #endif
} else if (cfg.valid_places.front().target == TARGET(kXPU)) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
paddle::lite::TargetWrapper<TARGET(kXPU)>::workspace_l3_size_per_thread = lite_cxx_config.set_xpu_workspace_l3_size_per_thread(
cfg.xpu_l3_workspace_size; cfg.xpu_l3_workspace_size);
#endif #endif
}
auto* p = new paddle::lite::Predictor(); // create predictor
p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes, std::shared_ptr<paddle::lite_api::PaddlePredictor> p =
cfg.model_type, cfg.model_from_memory); paddle::lite_api::CreatePaddlePredictor(lite_cxx_config);
engines_[name].reset(p); engines_[name] = std::move(p);
return p; return engines_[name].get();
} }
void EngineManager::DeleteAll() { void EngineManager::DeleteAll() {
for (auto& item : engines_) { for (auto& item : engines_) {
item.second.reset(nullptr); item.second.reset();
} }
} }
......
...@@ -23,12 +23,9 @@ ...@@ -23,12 +23,9 @@
#pragma GCC diagnostic push #pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wall" #pragma GCC diagnostic ignored "-Wall"
#include "lite/api/cxx_api.h" #include "lite/api/cxx_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_place.h" #include "lite/api/paddle_place.h"
#include "lite/core/context.h" #include "lite/api/paddle_use_passes.h"
#include "lite/core/device_info.h"
#include "lite/core/memory.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
namespace paddle { namespace paddle {
...@@ -38,25 +35,33 @@ namespace lite { ...@@ -38,25 +35,33 @@ namespace lite {
struct EngineConfig { struct EngineConfig {
std::string model; std::string model;
std::string param; std::string param;
paddle::lite::Place prefer_place; std::vector<paddle::lite_api::Place> valid_places;
std::vector<paddle::lite::Place> valid_places;
std::vector<std::string> neglected_passes; std::vector<std::string> neglected_passes;
lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf}; lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf};
bool model_from_memory{true}; bool model_from_memory{true};
// for xpu
size_t xpu_l3_workspace_size; size_t xpu_l3_workspace_size;
// for x86 or arm
int cpu_math_library_num_threads{1};
// for cuda
bool use_multi_stream{false};
}; };
class EngineManager { class EngineManager {
public: public:
bool Empty() const; bool Empty() const;
bool Has(const std::string& name) const; bool Has(const std::string& name) const;
paddle::lite::Predictor* Get(const std::string& name) const; paddle::lite_api::PaddlePredictor* Get(const std::string& name) const;
paddle::lite::Predictor* Create(const std::string& name, paddle::lite_api::PaddlePredictor* Create(const std::string& name,
const EngineConfig& cfg); const EngineConfig& cfg);
void DeleteAll(); void DeleteAll();
private: private:
std::unordered_map<std::string, std::unique_ptr<paddle::lite::Predictor>> std::unordered_map<std::string,
std::shared_ptr<paddle::lite_api::PaddlePredictor>>
engines_; engines_;
}; };
......
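A hedged sketch of driving the reworked engine manager declared above; the field names follow the EngineConfig struct, the singleton access mirrors the test further below, and the model/param buffers are placeholders rather than values from this diff.
paddle::inference::lite::EngineConfig cfg;
cfg.model = model_program_buffer;   // placeholder: serialized program contents
cfg.param = model_param_buffer;     // placeholder: serialized parameter contents
cfg.valid_places = {
    paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
    paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
};
cfg.cpu_math_library_num_threads = 1;
auto& manager =
    paddle::inference::Singleton<paddle::inference::lite::EngineManager>::Global();
paddle::lite_api::PaddlePredictor* predictor = manager.Create("engine_0", cfg);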
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/lite/tensor_utils.h" #include "paddle/fluid/inference/lite/tensor_utils.h"
#include <functional>
#include <map> #include <map>
#include <memory> #include <memory>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
...@@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data, ...@@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data,
} }
} }
void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) { void* GetLiteTensorDataPtr(paddle::lite_api::Tensor* src,
PrecisionType precision_type,
TargetType target_type) {
void* res{nullptr};
switch (precision_type) {
case PrecisionType::kFloat:
res = static_cast<void*>(src->mutable_data<float>(target_type));
break;
case PrecisionType::kInt8:
res = static_cast<void*>(src->mutable_data<int8_t>(target_type));
break;
case PrecisionType::kInt32:
res = static_cast<void*>(src->mutable_data<int32_t>(target_type));
break;
case PrecisionType::kInt64:
res = static_cast<void*>(src->mutable_data<int64_t>(target_type));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported precision type. Now only supports FP32, INT8, INT32 and "
"INT64."));
break;
}
return res;
}
int64_t GetLiteTensorNumel(const paddle::lite_api::Tensor& tensor) {
auto shape = tensor.shape();
int64_t numel = std::accumulate(shape.begin(), shape.end(), 1,
std::multiplies<int64_t>());
return numel;
}
void InitDstTensor(paddle::lite_api::Tensor* dst,
const framework::LoDTensor& src) {
// Currently, Lite needs to explicitly specify the target type of // Currently, Lite needs to explicitly specify the target type of
// the input tensor. // the input tensor.
constexpr int empty_size = 0; constexpr int empty_size = 0;
dst->mutable_data(GetLiteTargetType(src.place()), empty_size); dst->Resize({empty_size});
dst->set_precision(GetLitePrecisionType(src.type())); GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
SetLoD(dst->mutable_lod(), src.lod()); GetLiteTargetType(src.place()));
dst->SetPrecision(GetLitePrecisionType(src.type()));
paddle::lite::LoD lite_lod;
SetLoD(&lite_lod, src.lod());
dst->SetLoD(lite_lod);
} }
void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { void InitDstTensor(framework::LoDTensor* dst,
const paddle::lite_api::Tensor& src) {
constexpr framework::proto::VarType::Type dtype = constexpr framework::proto::VarType::Type dtype =
framework::proto::VarType_Type_FP32; framework::proto::VarType_Type_FP32;
dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()), dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()),
...@@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { ...@@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) {
} }
template <> template <>
void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, void TensorCopyAsync(paddle::lite_api::Tensor* dst,
const framework::LoDTensor& src,
const platform::DeviceContext& ctx) { const platform::DeviceContext& ctx) {
InitDstTensor(dst, src); InitDstTensor(dst, src);
const platform::Place& src_place = src.place(); const platform::Place& src_place = src.place();
...@@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, ...@@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src,
static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type()); static_cast<size_t>(src.numel()) * framework::SizeOfType(src.type());
dst->Resize(framework::vectorize(src.dims())); dst->Resize(framework::vectorize(src.dims()));
const void* src_data = src.data<void>(); const void* src_data = src.data<void>();
void* dst_data = dst->mutable_data(bytes); void* dst_data{nullptr};
dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()),
GetLiteTargetType(src.place()));
VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src
<< ", dst = " << dst << ", src_type = " << src.type(); << ", dst = " << dst << ", src_type = " << src.type();
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
VLOG(3) << "[Lite memory size] Bytes = " << dst->memory_size(); VLOG(3) << "[Lite memory size] Bytes = " << bytes;
} }
template <> template <>
void TensorCopyAsync(framework::LoDTensor* dst, const paddle::lite::Tensor& src, void TensorCopyAsync(framework::LoDTensor* dst,
const paddle::lite_api::Tensor& src,
const platform::DeviceContext& ctx) { const platform::DeviceContext& ctx) {
dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize())); dst->Resize(paddle::framework::make_ddim(src.shape()));
InitDstTensor(dst, src); InitDstTensor(dst, src);
const platform::Place& src_place = GetNativePlace(src.target()); const platform::Place& src_place = GetNativePlace(src.target());
const platform::Place& dst_place = dst->place(); const platform::Place& dst_place = dst->place();
const size_t bytes = int64_t src_numel = GetLiteTensorNumel(src);
static_cast<size_t>(src.numel()) * framework::SizeOfType(dst->type()); const size_t bytes = src_numel * framework::SizeOfType(dst->type());
const void* src_data = src.raw_data(); const void* src_data = src.data<void>();
// When Lite is ready, the source type needs to be modified here. // When Lite is ready, the source type needs to be modified here.
void* dst_data = dst->mutable_data(dst_place, dst->type()); void* dst_data = dst->mutable_data(dst_place, dst->type());
VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src
<< ", dst = " << dst << ", src_type = " << dst->type(); << ", dst = " << dst << ", src_type = " << dst->type();
MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx);
VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size(); VLOG(3) << "[Lite memory size] Bytes = " << bytes;
} }
template <> template <>
void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) { void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) {
const size_t bytes =
static_cast<size_t>(src->numel()) * framework::SizeOfType(src->type());
auto buf = std::make_shared<paddle::lite::Buffer>(paddle::lite::Buffer(
src->data<void>(), GetLiteTargetType(src->place()), src->memory_size()));
dst->Resize(framework::vectorize(src->dims())); dst->Resize(framework::vectorize(src->dims()));
dst->set_precision(GetLitePrecisionType(src->type())); dst->ShareExternalMemory(src->data<void>(), src->memory_size(),
SetLoD(dst->mutable_lod(), src->lod()); GetLiteTargetType(src->place()));
dst->ResetBuffer(buf, bytes); dst->SetPrecision(GetLitePrecisionType(src->type()));
paddle::lite::LoD lite_lod;
SetLoD(&lite_lod, src->lod());
dst->SetLoD(lite_lod);
} }
template <> template <>
void TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) { void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) {
constexpr framework::proto::VarType::Type dtype = constexpr framework::proto::VarType::Type dtype =
framework::proto::VarType_Type_FP32; framework::proto::VarType_Type_FP32;
void* src_raw_data = src->raw_data(); void* src_raw_data =
GetLiteTensorDataPtr(src, GetLitePrecisionType(dtype), src->target());
size_t memory_size = GetLiteTensorNumel(*src) * sizeof(float);
std::shared_ptr<memory::allocation::Allocation> holder( std::shared_ptr<memory::allocation::Allocation> holder(
new memory::allocation::Allocation(src_raw_data, src->memory_size(), new memory::allocation::Allocation(src_raw_data, memory_size,
GetNativePlace(src->target()))); GetNativePlace(src->target())));
dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize())); dst->Resize(paddle::framework::make_ddim(src->shape()));
SetLoD(dst->mutable_lod(), src->lod()); SetLoD(dst->mutable_lod(), src->lod());
dst->ResetHolderWithType(holder, dtype); dst->ResetHolderWithType(holder, dtype);
} }
......
...@@ -102,10 +102,10 @@ TEST(EngineManager, engine) { ...@@ -102,10 +102,10 @@ TEST(EngineManager, engine) {
config.model_from_memory = true; config.model_from_memory = true;
config.valid_places = { config.valid_places = {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
#endif #endif
paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
}; };
LOG(INFO) << "Create EngineManager"; LOG(INFO) << "Create EngineManager";
...@@ -118,7 +118,7 @@ TEST(EngineManager, engine) { ...@@ -118,7 +118,7 @@ TEST(EngineManager, engine) {
ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has( ASSERT_EQ(inference::Singleton<inference::lite::EngineManager>::Global().Has(
unique_key), unique_key),
true); true);
paddle::lite::Predictor* engine_0 = paddle::lite_api::PaddlePredictor* engine_0 =
inference::Singleton<inference::lite::EngineManager>::Global().Get( inference::Singleton<inference::lite::EngineManager>::Global().Get(
unique_key); unique_key);
CHECK_NOTNULL(engine_0); CHECK_NOTNULL(engine_0);
......
...@@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) { ...@@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) {
EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC)); EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC));
} }
template <typename T>
void test_lite_tensor_data_ptr(PrecisionType precision_type) {
void* GetLiteTensorDataPtr(paddle::lite_api::Tensor * src,
PrecisionType precision_type,
TargetType target_type);
const int count = 4;
paddle::lite::Tensor lite_tensor;
lite_tensor.Resize({count});
auto* lite_tensor_data = lite_tensor.mutable_data<T>();
for (size_t i = 0; i < count; ++i) {
lite_tensor_data[i] = i;
}
paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
T* data = static_cast<T*>(GetLiteTensorDataPtr(
&lite_api_tensor, precision_type, TargetType::kHost));
for (size_t i = 0; i < count; ++i) {
CHECK_EQ(data[i], static_cast<T>(i)) << "the i-th num is not correct.";
}
}
TEST(LiteEngineOp, GetLiteTensorDataPtr) {
test_lite_tensor_data_ptr<int64_t>(PrecisionType::kInt64);
test_lite_tensor_data_ptr<int32_t>(PrecisionType::kInt32);
test_lite_tensor_data_ptr<int8_t>(PrecisionType::kInt8);
EXPECT_ANY_THROW(test_lite_tensor_data_ptr<double>(PrecisionType::kUnk));
}
void test_tensor_copy(const platform::DeviceContext& ctx) { void test_tensor_copy(const platform::DeviceContext& ctx) {
// Create LoDTensor. // Create LoDTensor.
std::vector<float> vector({1, 2, 3, 4}); std::vector<float> vector({1, 2, 3, 4});
...@@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { ...@@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) {
lod_tensor.set_lod(lod); lod_tensor.set_lod(lod);
// Create lite::Tensor and copy. // Create lite::Tensor and copy.
paddle::lite::Tensor lite_tensor; paddle::lite::Tensor lite_tensor;
TensorCopyAsync(&lite_tensor, lod_tensor, ctx); paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
TensorCopyAsync(&lite_api_tensor, lod_tensor, ctx);
// Copy to LoDTensor. // Copy to LoDTensor.
framework::LoDTensor lod_tensor_n; framework::LoDTensor lod_tensor_n;
TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
platform::GpuStreamSync( platform::GpuStreamSync(
...@@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) { ...@@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) {
lod_tensor.set_lod(lod); lod_tensor.set_lod(lod);
// Create lite::Tensor and share. // Create lite::Tensor and share.
paddle::lite::Tensor lite_tensor; paddle::lite::Tensor lite_tensor;
TensorDataShare(&lite_tensor, &lod_tensor); paddle::lite_api::Tensor lite_api_tensor(&lite_tensor);
TensorDataShare(&lite_api_tensor, &lod_tensor);
// Copy to LoDTensor. // Copy to LoDTensor.
framework::LoDTensor lod_tensor_n; framework::LoDTensor lod_tensor_n;
TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx);
std::vector<float> result; std::vector<float> result;
TensorToVector(lod_tensor_n, ctx, &result); TensorToVector(lod_tensor_n, ctx, &result);
ASSERT_EQ(result, vector); ASSERT_EQ(result, vector);
......
...@@ -63,11 +63,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers, ...@@ -63,11 +63,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
void TensorRTEngine::FreezeNetwork() { void TensorRTEngine::FreezeNetwork() {
freshDeviceId(); freshDeviceId();
VLOG(3) << "TRT to freeze network"; VLOG(3) << "TRT to freeze network";
PADDLE_ENFORCE(infer_builder_ != nullptr, PADDLE_ENFORCE_NOT_NULL(infer_builder_,
"Call InitNetwork first to initialize network."); platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(network() != nullptr, true, "Inference builder of TRT is null. Please make "
platform::errors::InvalidArgument( "sure you call InitNetwork first."));
"Call InitNetwork first to initialize network.")); PADDLE_ENFORCE_NOT_NULL(network(),
platform::errors::InvalidArgument(
"Call InitNetwork first to initialize network."));
// build engine. // build engine.
infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxBatchSize(max_batch_);
infer_builder_->setMaxWorkspaceSize(max_workspace_); infer_builder_->setMaxWorkspaceSize(max_workspace_);
...@@ -210,7 +212,10 @@ void TensorRTEngine::FreezeNetwork() { ...@@ -210,7 +212,10 @@ void TensorRTEngine::FreezeNetwork() {
} else { } else {
infer_engine_.reset(infer_builder_->buildCudaEngine(*network())); infer_engine_.reset(infer_builder_->buildCudaEngine(*network()));
} }
PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); PADDLE_ENFORCE_NOT_NULL(
infer_engine_, platform::errors::Fatal(
"Build TensorRT cuda engine failed! Please recheck "
"you configurations related to paddle-TensorRT."));
} }
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
...@@ -220,8 +225,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, ...@@ -220,8 +225,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The TRT network should be initialized first.")); "The TRT network should be initialized first."));
auto *input = network()->addInput(name.c_str(), dtype, dims); auto *input = network()->addInput(name.c_str(), dtype, dims);
PADDLE_ENFORCE(input, "infer network add input %s failed", name); PADDLE_ENFORCE_NOT_NULL(
PADDLE_ENFORCE(input->isNetworkInput()); input, platform::errors::InvalidArgument("Adding input %s failed in "
"TensorRT inference network. "
"Please recheck your input.",
name));
PADDLE_ENFORCE_EQ(input->isNetworkInput(), true,
platform::errors::InvalidArgument(
"Input %s is not the input of TRT inference network. "
"Please recheck your input.",
name));
TensorRTEngine::SetITensor(name, input); TensorRTEngine::SetITensor(name, input);
return input; return input;
} }
...@@ -230,31 +243,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, ...@@ -230,31 +243,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
const std::string &name) { const std::string &name) {
auto *output = layer->getOutput(offset); auto *output = layer->getOutput(offset);
SetITensor(name, output); SetITensor(name, output);
PADDLE_ENFORCE(output != nullptr); PADDLE_ENFORCE_NOT_NULL(
output, platform::errors::InvalidArgument(
"The output %s of TRT engine should not be null.", name));
output->setName(name.c_str()); output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput()); PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
platform::errors::InvalidArgument(
"The output %s of TRT engine should not be the input "
"of the network at the same time.",
name));
network()->markOutput(*output); network()->markOutput(*output);
PADDLE_ENFORCE(output->isNetworkOutput()); PADDLE_ENFORCE_EQ(
output->isNetworkOutput(), true,
platform::errors::InvalidArgument(
"The output %s of TRT engine should be the output of the network.",
name));
} }
void TensorRTEngine::DeclareOutput(const std::string &name) { void TensorRTEngine::DeclareOutput(const std::string &name) {
auto *output = TensorRTEngine::GetITensor(name); auto *output = TensorRTEngine::GetITensor(name);
PADDLE_ENFORCE(output != nullptr); PADDLE_ENFORCE_NOT_NULL(
output, platform::errors::InvalidArgument(
"The output %s of TRT engine should not be null.", name));
output->setName(name.c_str()); output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput()); PADDLE_ENFORCE_EQ(output->isNetworkInput(), false,
platform::errors::InvalidArgument(
"The output %s of TRT engine should not be the input "
"of the network at the same time.",
name));
network()->markOutput(*output); network()->markOutput(*output);
} }
void TensorRTEngine::SetITensor(const std::string &name, void TensorRTEngine::SetITensor(const std::string &name,
nvinfer1::ITensor *tensor) { nvinfer1::ITensor *tensor) {
PADDLE_ENFORCE(tensor != nullptr); PADDLE_ENFORCE_NOT_NULL(
PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s", tensor, platform::errors::InvalidArgument(
name); "Tensor named %s of TRT engine should not be null.", name));
PADDLE_ENFORCE_EQ(
0, itensor_map_.count(name),
platform::errors::InvalidArgument(
"Tensor named %s of TRT engine should not be duplicated", name));
itensor_map_[name] = tensor; itensor_map_[name] = tensor;
} }
nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name); PADDLE_ENFORCE_EQ(itensor_map_.count(name), true,
platform::errors::NotFound(
"Tensor named %s is not found in TRT engine", name));
return itensor_map_[name]; return itensor_map_[name];
} }
...@@ -271,11 +306,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name, ...@@ -271,11 +306,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name,
std::string splitter = "__"; std::string splitter = "__";
std::string name_with_suffix = name + splitter + name_suffix; std::string name_with_suffix = name + splitter + name_suffix;
platform::CPUPlace cpu_place; platform::CPUPlace cpu_place;
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), 0,
weight_map.count(name_with_suffix), 0, platform::errors::AlreadyExists(
"During TRT Op converter: We set weight %s with the same name " "The weight named %s is set into the weight map "
"twice into the weight_map", "twice in TRT OP converter.",
name_with_suffix); name_with_suffix));
weight_map[name_with_suffix].reset(new framework::Tensor()); weight_map[name_with_suffix].reset(new framework::Tensor());
weight_map[name_with_suffix]->Resize(weight_tensor->dims()); weight_map[name_with_suffix]->Resize(weight_tensor->dims());
TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get()); TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get());
...@@ -297,7 +332,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( ...@@ -297,7 +332,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
void TensorRTEngine::freshDeviceId() { void TensorRTEngine::freshDeviceId() {
int count; int count;
cudaGetDeviceCount(&count); cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_id_, count); PADDLE_ENFORCE_LT(device_id_, count,
platform::errors::OutOfRange(
"Device id %d exceeds the current device count: %d.",
device_id_, count));
cudaSetDevice(device_id_); cudaSetDevice(device_id_);
} }
......
...@@ -196,8 +196,10 @@ class TensorRTEngine { ...@@ -196,8 +196,10 @@ class TensorRTEngine {
} }
nvinfer1::IHostMemory* Serialize() { nvinfer1::IHostMemory* Serialize() {
PADDLE_ENFORCE(infer_engine_ != nullptr, PADDLE_ENFORCE_NOT_NULL(
"You should build engine first and then serialize"); infer_engine_,
platform::errors::InvalidArgument(
"The TensorRT engine must be built first before serialization"));
ihost_memory_.reset(infer_engine_->serialize()); ihost_memory_.reset(infer_engine_->serialize());
return ihost_memory_.get(); return ihost_memory_.get();
} }
...@@ -222,8 +224,14 @@ class TensorRTEngine { ...@@ -222,8 +224,14 @@ class TensorRTEngine {
engine_serialized_data.c_str(), engine_serialized_data.size(), engine_serialized_data.c_str(), engine_serialized_data.size(),
&inference::Singleton<plugin::PluginFactoryTensorRT>::Global())); &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
} }
PADDLE_ENFORCE(infer_engine_ != nullptr, PADDLE_ENFORCE_NOT_NULL(
"build cuda engine failed when deserialize engine info.!"); infer_engine_,
platform::errors::Fatal(
"Building TRT cuda engine failed when deserializing engine info. "
"Please check:\n1. Your TRT serialization is generated and loaded "
"on the same GPU architecture;\n2. The Paddle Inference version of "
"generating serialization file and doing inference are "
"consistent."));
} }
void SetRuntimeBatch(size_t batch_size); void SetRuntimeBatch(size_t batch_size);
......
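A minimal sketch of the serialize/deserialize round trip guarded by the checks above, assuming engine is a built TensorRTEngine* and that the member consuming engine_serialized_data is Deserialize(const std::string&); only Serialize() is fully visible in this hunk, so both assumptions are hedged.
nvinfer1::IHostMemory* blob = engine->Serialize();  // enforces that the engine was built first
std::string engine_bytes(static_cast<const char*>(blob->data()), blob->size());
// ... persist engine_bytes, then later, on the same GPU architecture and with the
// same Paddle Inference version, restore it:
engine->Deserialize(engine_bytes);  // raises the Fatal error above if rebuilding fails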
...@@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data, ...@@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data,
nvinfer1::Dims ElementWisePlugin::getOutputDimensions( nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
int index, const nvinfer1::Dims *input_dims, int num_inputs) { int index, const nvinfer1::Dims *input_dims, int num_inputs) {
PADDLE_ENFORCE_EQ(index, 0); PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(num_inputs, 2); "There is only one output in TRT elementwise "
PADDLE_ENFORCE_NOT_NULL(input_dims); "op plugin, but got output index: %d.",
index));
PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument(
"There are 2 inputs in TRT elementwise "
"op plugin, but got input number: %d.",
num_inputs));
PADDLE_ENFORCE_NOT_NULL(
input_dims,
platform::errors::InvalidArgument(
"The input dims of TRT elementwise op plugin should not be null."));
return input_dims[0]; return input_dims[0];
} }
int ElementWisePlugin::initialize() { int ElementWisePlugin::initialize() {
PADDLE_ENFORCE_GT(dims_y_.nbDims, 0); PADDLE_ENFORCE_GT(dims_y_.nbDims, 0,
platform::errors::InvalidArgument(
"The dimension of input Y of TRT elementwise op plugin "
"should be greater than 0, but got %d.",
dims_y_.nbDims));
axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_; axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_;
int trimed_nb_dims = dims_y_.nbDims; int trimed_nb_dims = dims_y_.nbDims;
...@@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() { ...@@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() {
} }
dims_y_.nbDims = trimed_nb_dims; dims_y_.nbDims = trimed_nb_dims;
PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_); PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_,
PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims); platform::errors::InvalidArgument(
"We expect [number of x dims] >= [number of y dims + "
"axis] in TRT elementwise op plugin, but got [number "
"of x dims] = %d, [number of y dims + axis] = %d.",
dims_x_.nbDims, dims_y_.nbDims + axis_));
PADDLE_ENFORCE_LT(
axis_, dims_x_.nbDims,
platform::errors::InvalidArgument("We expect [axis] < [number of x dims] "
"in TRT elementwise op plugin, but got "
"[axis] = %d, [number of x dims] = %d.",
axis_, dims_x_.nbDims));
prev_size_ = 1; prev_size_ = 1;
midd_size_ = 1; midd_size_ = 1;
...@@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() { ...@@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() {
for (int i = 0; i < dims_y_.nbDims; ++i) { for (int i = 0; i < dims_y_.nbDims; ++i) {
PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i], PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i],
"Broadcast dimension mismatch."); platform::errors::InvalidArgument(
"Broadcast dimension mismatch. The dims of input Y "
"should be a subsequence of X."));
midd_size_ *= dims_y_.d[i]; midd_size_ *= dims_y_.d[i];
} }
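A worked illustration of the decomposition initialize() builds for broadcasting (all values below are assumed, and post_size_ is computed outside this hunk):
// dims_x_ = {8, 16, 32}, dims_y_ = {16}, axis_ = 1
// prev_size_ = 8    -> product of X dims before axis_
// midd_size_ = 16   -> product of Y dims; must equal the aligned X dims
// post_size_ = 32   -> product of the trailing X dims
// Each Y element is then reused across the prev and post extents by the CUDA kernel.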
...@@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue( ...@@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue(
elementwise_kernel<<<block, thread, 0, stream>>>( elementwise_kernel<<<block, thread, 0, stream>>>(
num, x, y, out, prev_size, midd_size, post_size, details::Mul<float>()); num, x, y, out, prev_size, midd_size, post_size, details::Mul<float>());
} else { } else {
PADDLE_THROW("Not implemented."); PADDLE_THROW(platform::errors::Unimplemented(
"Paddle-TRT only support elementwise operation: {add, mul} currently, "
"but got %s.",
type_));
} }
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
......
...@@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) {
nvinfer1::DimsCHW{1, 1, 1}); nvinfer1::DimsCHW{1, 1, 1});
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(fc_layer,
platform::errors::InvalidArgument(
"TRT fully connected layer building failed."));
engine_->DeclareOutput(fc_layer, 0, "y"); engine_->DeclareOutput(fc_layer, 0, "y");
LOG(INFO) << "freeze network"; LOG(INFO) << "freeze network";
...@@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { ...@@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
nvinfer1::DimsCHW{1, 2, 1}); nvinfer1::DimsCHW{1, 2, 1});
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(fc_layer,
platform::errors::InvalidArgument(
"TRT fully connected layer building failed."));
engine_->DeclareOutput(fc_layer, 0, "y"); engine_->DeclareOutput(fc_layer, 0, "y");
engine_->FreezeNetwork(); engine_->FreezeNetwork();
...@@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) { ...@@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
auto *conv_layer = auto *conv_layer =
TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
weight.get(), bias.get()); weight.get(), bias.get());
PADDLE_ENFORCE(conv_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(conv_layer,
platform::errors::InvalidArgument(
"TRT convolution layer building failed."));
conv_layer->setStride(nvinfer1::DimsHW{1, 1}); conv_layer->setStride(nvinfer1::DimsHW{1, 1});
conv_layer->setPadding(nvinfer1::DimsHW{1, 1}); conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
...@@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) { ...@@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t, auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
nvinfer1::DimsHW{2, 2}); nvinfer1::DimsHW{2, 2});
PADDLE_ENFORCE(pool_layer != nullptr); PADDLE_ENFORCE_NOT_NULL(
pool_layer,
platform::errors::InvalidArgument("TRT pooling layer building failed."));
pool_layer->setStride(nvinfer1::DimsHW{1, 1}); pool_layer->setStride(nvinfer1::DimsHW{1, 1});
pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); pool_layer->setPadding(nvinfer1::DimsHW{0, 0});
......
...@@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch( ...@@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch(
engine_name_, it.first)); engine_name_, it.first));
} }
const auto& d = dataptr->second; const auto& d = dataptr->second;
PADDLE_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice));
"Fail to cudaMemcpy %s for %s", engine_name_, it.first);
} }
data_is_set_ = true; data_is_set_ = true;
......
...@@ -342,9 +342,9 @@ if(WITH_MKLDNN) ...@@ -342,9 +342,9 @@ if(WITH_MKLDNN)
### Lexical analysis GRU model ### Lexical analysis GRU model
set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru") set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru")
download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz") download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz")
download_GRU_data("${GRU_PATH}" "GRU_eval_model.tar.gz") download_GRU_data("${GRU_PATH}" "GRU_eval_model_v2.tar.gz")
set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin") set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin")
set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model") set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2")
set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis") set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis")
set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc") set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc")
......
...@@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) { ...@@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) {
AnalysisConfig config; AnalysisConfig config;
config.EnableUseGpu(100, 0); config.EnableUseGpu(100, 0);
config.SetModel(model_dir + "/model", model_dir + "/params"); config.SetModel(model_dir + "/model", model_dir + "/params");
config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32); config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true);
std::vector<PaddleTensor> inputs; std::vector<PaddleTensor> inputs;
auto predictor = CreatePaddlePredictor(config); auto predictor = CreatePaddlePredictor(config);
......
...@@ -45,7 +45,9 @@ endif() ...@@ -45,7 +45,9 @@ endif()
SET(OP_HEADER_DEPS xxhash executor) SET(OP_HEADER_DEPS xxhash executor)
if (WITH_GPU) if (WITH_GPU)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
endif()
endif() endif()
SET(OP_PREFETCH_DEPS "") SET(OP_PREFETCH_DEPS "")
......
...@@ -12,32 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,32 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" #include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/framework/tensor_util.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel { class CheckFiniteAndUnscaleOp : public framework::OperatorWithKernel {
public: public:
AmpCheckFiniteAndScaleOp(const std::string &type, CheckFiniteAndUnscaleOp(const std::string& type,
const framework::VariableNameMap &inputs, const framework::VariableNameMap& inputs,
const framework::VariableNameMap &outputs, const framework::VariableNameMap& outputs,
const framework::AttributeMap &attrs) const framework::AttributeMap& attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {} : OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X",
"amp_check_finite_and_unscale"); "check_finite_and_unscale");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"amp_check_finite_and_unscale"); "check_finite_and_unscale");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ctx->Inputs("X").size(), ctx->Outputs("Out").size(), ctx->Inputs("X").size(), ctx->Outputs("Out").size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The input(X) and output(Out) should have same size in " "The input(X) and output(Out) should have same size in "
"Operator(amp_check_finite_and_unscale), size of input(X) is %d " "Operator(check_finite_and_unscale), size of input(X) is %d "
"and size of output(Out) is %d.", "and size of output(Out) is %d.",
ctx->Inputs("X").size(), ctx->Outputs("Out").size())); ctx->Inputs("X").size(), ctx->Outputs("Out").size()));
auto x_dims = ctx->GetInputsDim("X"); auto x_dims = ctx->GetInputsDim("X");
...@@ -47,34 +45,34 @@ class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel { ...@@ -47,34 +45,34 @@ class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
} }
}; };
class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker { class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput( AddInput(
"X", "X",
"(Tensors) The input tensors of amp_check_finite_and_scale operator.") "(Tensors) The input tensors of check_finite_and_unscale operator.")
.AsDuplicable(); .AsDuplicable();
AddInput("Scale", AddInput("Scale",
"(Tensor) 1-dim tensor, the scale of amp_check_finite_and_scale " "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
"operator."); "operator.");
AddOutput("Out", AddOutput("Out",
"(Tensors) The scaled output tensor of " "(Tensors) The scaled output tensor of "
"amp_check_finite_and_unscale operator.") "check_finite_and_unscale operator.")
.AsDuplicable(); .AsDuplicable();
AddOutput("FoundInfinite", AddOutput("FoundInfinite",
"(Tensor) 1-dim tensor, contains a bool scalar, which indicates " "(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
"if there there is infinite or nan item in input X."); "if there there is infinite or nan item in input X.");
AddComment(R"DOC( AddComment(R"DOC(
amp_check_finite_and_scale operator. check_finite_and_unscale operator.
Check if input X contains all finite data, if yes, scale it by input Scale. Check if input X contains all finite data, if yes, scale it by input Scale.
$$Out = X * scale$$ $$Out = X / scale$$
If any tensor in X contains Inf or Nan, the Out will generate an indicator. If any tensor in X contains Inf or Nan, the Out will generate an indicator.
FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of
...@@ -85,20 +83,59 @@ Otherwise, FoundInfinite will be 0 (False). ...@@ -85,20 +83,59 @@ Otherwise, FoundInfinite will be 0 (False).
} }
}; };
template <typename T>
class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const T* scale_data = scale->data<T>();
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
*found_inf_data = false;
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, platform::CPUDeviceContext>({1}, dev_ctx);
bool* is_finite_data = is_finite.template data<bool>();
auto& dev = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
T inverse_scale = Inverse<T>(*scale_data);
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(dev_ctx.GetPlace());
if (!(*found_inf_data)) {
framework::TensorIsfinite(*x, &is_finite);
*found_inf_data = !(*is_finite_data);
}
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*x);
if (!(*found_inf_data)) {
eigen_out.device(dev) = eigen_in * inverse_scale;
} else {
eigen_out.device(dev) = eigen_in * static_cast<T>(0);
}
}
return;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR( REGISTER_OPERATOR(
amp_check_finite_and_scale, ops::AmpCheckFiniteAndScaleOp, check_finite_and_unscale, ops::CheckFiniteAndUnscaleOp,
ops::AmpCheckFiniteAndScaleOpMaker, ops::CheckFiniteAndUnscaleOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>); paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(check_finite_and_unscale,
amp_check_finite_and_scale, ops::CheckFiniteAndUnscaleCpuKernel<float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext, ops::CheckFiniteAndUnscaleCpuKernel<double>);
float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CPUDeviceContext,
double>);
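A note on the semantics documented above: Out = X / Scale only when every element of every input tensor is finite; otherwise FoundInfinite is set and the outputs are not valid unscaled values. The standalone C++ sketch below (plain std::vector instead of framework::Tensor; the helper name CheckFiniteAndUnscaleRef is made up for illustration) mirrors that contract:

#include <cmath>
#include <cstdio>
#include <vector>

// Illustrative stand-in for the operator: every element is multiplied by
// 1/scale; once a non-finite value is seen, found_infinite is set and the
// remaining outputs are zeroed instead of unscaled.
void CheckFiniteAndUnscaleRef(const std::vector<std::vector<float>>& xs,
                              float scale,
                              std::vector<std::vector<float>>* outs,
                              bool* found_infinite) {
  *found_infinite = false;
  const float inverse_scale = 1.0f / scale;
  outs->assign(xs.size(), std::vector<float>());
  for (size_t i = 0; i < xs.size(); ++i) {
    (*outs)[i].resize(xs[i].size());
    for (size_t j = 0; j < xs[i].size(); ++j) {
      if (!std::isfinite(xs[i][j])) *found_infinite = true;
      (*outs)[i][j] = *found_infinite ? 0.0f : xs[i][j] * inverse_scale;
    }
  }
}

int main() {
  bool found_infinite = false;
  std::vector<std::vector<float>> outs;
  CheckFiniteAndUnscaleRef({{2.0f, 4.0f}, {8.0f}}, 2.0f, &outs,
                           &found_infinite);
  // Prints: found_infinite=0 out=1.0 2.0 4.0
  std::printf("found_infinite=%d out=%.1f %.1f %.1f\n", found_infinite,
              outs[0][0], outs[0][1], outs[1][0]);
  return 0;
}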
...@@ -14,28 +14,31 @@ limitations under the License. */ ...@@ -14,28 +14,31 @@ limitations under the License. */
#include <cuda.h> #include <cuda.h>
#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" #include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T> template <typename T>
__global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num, __global__ void GpuInverse(const T* s, T* o) {
bool* found_inf, T* out) { *o = Inverse<T>(*s);
}
template <typename T>
__global__ void CheckFiniteAndUnscale(const T* in, const T* scale, int num,
bool* found_inf, T* out) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x; const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) { if (idx < num) {
if (!isfinite(in[idx])) { if (!isfinite(in[idx])) {
*found_inf = 1; *found_inf = true;
} }
out[idx] = *found_inf ? in[idx] : in[idx] * scale[0]; out[idx] = *found_inf ? in[idx] : in[idx] * (*scale);
} }
} }
template <typename T> template <typename T>
class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel<T> {
: public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const { void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
...@@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> ...@@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace()); bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool)); cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool));
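// Compute 1/scale once on the device with a one-thread kernel, so the
// unscale kernel below can multiply by it and the scale value never has to
// be copied back to the host.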
framework::Tensor inverse_scale =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({1}, dev_ctx);
T* inverse_scale_v = inverse_scale.template data<T>();
GpuInverse<T><<<1, 1, 0, dev_ctx.stream()>>>(scale_data, inverse_scale_v);
for (size_t i = 0; i < xs.size(); ++i) { for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i]; const auto* x = xs[i];
auto* out = outs[i]; auto* out = outs[i];
...@@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> ...@@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace()); T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = x->numel(); int num = x->numel();
int block = 512; int block = 1024;
int grid = (num + block - 1) / block; int grid = (num + block - 1) / block;
VLOG(3) << "launch kernel"; VLOG(3) << "launch kernel";
AmpCheckFiniteAndScale<T><<<grid, block, 0, dev_ctx.stream()>>>( CheckFiniteAndUnscale<T><<<grid, block, 0, dev_ctx.stream()>>>(
x_data, scale_data, num, found_inf_data, out_data); x_data, inverse_scale_v, num, found_inf_data, out_data);
VLOG(3) << "finish kernel"; VLOG(3) << "finish kernel";
} }
} }
...@@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T> ...@@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel<platform::CUDADeviceContext, T>
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(check_finite_and_unscale,
amp_check_finite_and_scale, ops::CheckFiniteAndUnscaleGpuKernel<float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext, ops::CheckFiniteAndUnscaleGpuKernel<double>);
float>,
ops::AmpCheckFiniteAndScaleKernel<paddle::platform::CUDADeviceContext,
double>);
...@@ -16,51 +16,16 @@ limitations under the License. */ ...@@ -16,51 +16,16 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/isfinite_op.h" #include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename DeviceContext, typename T> template <typename T>
class AmpCheckFiniteAndScaleKernel : public framework::OpKernel<T> { inline HOSTDEVICE T Inverse(T s) {
public: return 1.0 / s;
void Compute(const framework::ExecutionContext& ctx) const { }
auto& dev_ctx = ctx.template device_context<DeviceContext>();
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* scale = ctx.Input<framework::Tensor>("Scale");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* found_inf = ctx.Output<framework::Tensor>("FoundInfinite");
const T* scale_data = scale->data<T>();
bool* found_inf_data = found_inf->mutable_data<bool>(dev_ctx.GetPlace());
*found_inf_data = false;
framework::Tensor is_finite =
ctx.AllocateTmpTensor<bool, DeviceContext>({1}, dev_ctx);
bool* is_finite_data = is_finite.template data<bool>();
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(dev_ctx.GetPlace());
if (!(*found_inf_data)) {
framework::TensorIsfinite(*x, &is_finite);
if (*is_finite_data) {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*x);
eigen_out.device(dev) = (*scale_data) * eigen_in;
} else {
*found_inf_data = true;
break;
}
}
}
return;
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class UpdateLossScalingOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("FoundInfinite"), "Input", "FoundInfinite",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("PrevLossScaling"), "Input", "PrevLossScaling",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("InGoodSteps"), "Input", "InGoodSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasInput("InBadSteps"), "Input", "InBadSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("LossScaling"), "Output", "LossScaling",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("OutGoodSteps"), "Output", "OutGoodSteps",
"update_loss_scaling");
OP_INOUT_CHECK(ctx->HasOutput("OutBadSteps"), "Output", "OutBadSteps",
"update_loss_scaling");
auto x_dims = ctx->GetInputsDim("X");
ctx->SetOutputsDim("Out", x_dims);
ctx->SetOutputDim("LossScaling", {1});
ctx->SetOutputDim("OutGoodSteps", {1});
ctx->SetOutputDim("OutBadSteps", {1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "PrevLossScaling"),
ctx.device_context());
}
};
class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensors) The input tensors of update_loss_scaling operator.")
.AsDuplicable();
AddInput("FoundInfinite",
"(Tensor) 1-dim tensor, contains a bool scalar, which indicates "
"whether there is any infinite gradient.");
AddInput("PrevLossScaling",
"(Tensor) 1-dim tensor, previous loss scaling.");
AddInput("InGoodSteps",
"(Tensor) 1-dim tensor, accumulates good steps in which all "
"gradients are finite.");
AddInput("InBadSteps",
"(Tensor) 1-dim tensor, accumulates bad steps in which some "
"gradients are infinite.");
AddOutput("Out",
"(Tensors) The output tensor of update_loss_scaling operator.")
.AsDuplicable();
AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling.");
AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, pdated good steps.");
AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps.");
AddAttr<int>("incr_every_n_steps",
"A value represents increasing loss scaling every n "
"consecutive steps with finite gradients.");
AddAttr<int>("decr_every_n_nan_or_inf",
"A value represents decreasing loss scaling every n "
"accumulated steps with nan or inf gradients.");
AddAttr<float>("incr_ratio",
"The multiplier to use when increasing the loss scaling.")
.AddCustomChecker([](float incr_ratio) {
PADDLE_ENFORCE_EQ(incr_ratio > 1.0f, true,
platform::errors::InvalidArgument(
"'incr_ratio' should be greater than 1, but "
"the received is %f",
incr_ratio));
});
AddAttr<float>(
"decr_ratio",
"The less-than-one-multiplier to use when decreasing loss scaling.")
.AddCustomChecker([](float decr_ratio) {
PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true,
platform::errors::InvalidArgument(
"'incr_ratio' should be between 0 and 1, but "
"the received is %f",
decr_ratio));
});
AddComment(R"DOC(
Update the loss scaling according to the state of the overall gradients. If all
gradients are finite for incr_every_n_steps consecutive steps, the loss scaling
will increase by incr_ratio. Otherwise, the loss scaling will decrease by
decr_ratio after decr_every_n_nan_or_inf accumulated steps in which some
gradients are infinite.
)DOC");
}
};
template <typename T>
class UpdateLossScalingFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& ctx,
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const {
Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data,
bad_out_data);
}
};
template <typename T>
class LazyZeroInputs<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
if (*found_inf_data) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
std::memset(out_data, 0, num * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(
update_loss_scaling, ops::UpdateLossScalingOp,
ops::UpdateLossScalingOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<CPU, float>,
ops::UpdateLossScalingKernel<CPU, double>);
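To make the counter logic described in the op comment above concrete, here is a small host-only C++ sketch with a short numeric walk-through; it is illustrative only — the real implementation is the Update<T> helper declared in update_loss_scaling_op.h further below, which additionally clamps a decreased scale at 1.0 and rejects an increase that would overflow to Inf:

#include <cassert>

struct LossScaleState {
  float scale;
  int good;  // consecutive steps with all-finite gradients
  int bad;   // accumulated steps with Inf/NaN gradients
};

// Simplified mirror of the update rule, for illustration only.
LossScaleState Step(LossScaleState s, bool found_inf, int incr_every_n_steps,
                    int decr_every_n_nan_or_inf, float incr_ratio,
                    float decr_ratio) {
  if (found_inf) {
    s.good = 0;
    if (++s.bad == decr_every_n_nan_or_inf) {
      s.scale *= decr_ratio;  // too many bad steps: shrink the scale
      s.bad = 0;
    }
  } else {
    s.bad = 0;
    if (++s.good == incr_every_n_steps) {
      s.scale *= incr_ratio;  // enough consecutive good steps: grow the scale
      s.good = 0;
    }
  }
  return s;
}

int main() {
  LossScaleState s{1024.0f, 0, 0};
  s = Step(s, false, 2, 1, 2.0f, 0.5f);  // 1st good step, scale stays 1024
  s = Step(s, false, 2, 1, 2.0f, 0.5f);  // 2nd good step -> scale becomes 2048
  s = Step(s, true, 2, 1, 2.0f, 0.5f);   // 1 bad step -> scale halves to 1024
  assert(s.scale == 1024.0f);
  return 0;
}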
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/update_loss_scaling_op.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void GpuUpdateLossScaling(
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps, const int decr_every_n_nan_or_inf,
const float incr_ratio, const float decr_ratio,
T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) {
Update<T>(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
updated_loss_scaling_data, good_out_data, bad_out_data);
}
template <typename T>
class UpdateLossScalingFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
const bool* found_inf_data, const T* pre_loss_scaling_data,
const int* good_in_data, const int* bad_in_data,
const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const {
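    // The loss-scaling state is scalar (one scale value plus two step
    // counters), so a single-thread launch is sufficient here.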
GpuUpdateLossScaling<T><<<1, 1, 0, dev_ctx.stream()>>>(
found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data,
incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio,
updated_loss_scaling_data, good_out_data, bad_out_data);
}
};
template <typename T>
class LazyZeroInputs<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& dev_ctx,
const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const {
const auto gpu_place =
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace());
bool has_inf{false};
memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data,
sizeof(bool), dev_ctx.stream());
if (has_inf) {
VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --";
for (size_t i = 0; i < xs.size(); ++i) {
auto* out = outs[i];
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = out->numel();
cudaMemset(out_data, 0, num * sizeof(T));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using GPU = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(update_loss_scaling,
ops::UpdateLossScalingKernel<GPU, float>,
ops::UpdateLossScalingKernel<GPU, double>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
HOSTDEVICE void Update(const bool* found_inf_data,
const T* pre_loss_scaling_data, const int* good_in_data,
const int* bad_in_data, const int incr_every_n_steps,
const int decr_every_n_nan_or_inf,
const float incr_ratio, const float decr_ratio,
T* updated_loss_scaling_data, int* good_out_data,
int* bad_out_data) {
if (*found_inf_data) {
*good_out_data = 0;
*bad_out_data = *bad_in_data + 1;
if (*bad_out_data == decr_every_n_nan_or_inf) {
T new_loss_scaling = *pre_loss_scaling_data * decr_ratio;
*updated_loss_scaling_data = new_loss_scaling < static_cast<T>(1)
? static_cast<T>(1)
: new_loss_scaling;
*bad_out_data = 0;
}
} else {
*bad_out_data = 0;
*good_out_data = *good_in_data + 1;
if (*good_out_data == incr_every_n_steps) {
T new_loss_scaling = *pre_loss_scaling_data * incr_ratio;
*updated_loss_scaling_data = std::isfinite(new_loss_scaling)
? new_loss_scaling
: *pre_loss_scaling_data;
*good_out_data = 0;
}
}
}
template <typename DeviceContext, typename T>
class UpdateLossScalingFunctor {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const T* pre_loss_scaling_data, const int* good_in_data,
const int* bad_in_data, const int incr_every_n_steps,
const int decr_every_n_nan_or_inf, const float incr_ratio,
const float decr_ratio, T* updated_loss_scaling_data,
int* good_out_data, int* bad_out_data) const;
};
template <typename DeviceContext, typename T>
class LazyZeroInputs {
public:
void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data,
const std::vector<const framework::Tensor*>& xs,
const std::vector<framework::Tensor*>& outs) const;
};
template <typename DeviceContext, typename T>
class UpdateLossScalingKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto xs = ctx.MultiInput<framework::Tensor>("X");
const auto* found_inf = ctx.Input<Tensor>("FoundInfinite");
const auto* pre_loss_scaling = ctx.Input<Tensor>("PrevLossScaling");
const auto* good_in = ctx.Input<Tensor>("InGoodSteps");
const auto* bad_in = ctx.Input<Tensor>("InBadSteps");
auto outs = ctx.MultiOutput<framework::Tensor>("Out");
auto* updated_loss_scaling = ctx.Output<Tensor>("LossScaling");
auto* good_out = ctx.Output<Tensor>("OutGoodSteps");
auto* bad_out = ctx.Output<Tensor>("OutBadSteps");
PADDLE_ENFORCE_EQ(found_inf->numel(), 1,
platform::errors::InvalidArgument(
"FoundInfinite must has only one element."));
const bool* found_inf_data = found_inf->data<bool>();
const T* pre_loss_scaling_data = pre_loss_scaling->data<T>();
const int* good_in_data = good_in->data<int>();
const int* bad_in_data = bad_in->data<int>();
auto& dev_ctx = ctx.template device_context<DeviceContext>();
T* updated_loss_scaling_data =
updated_loss_scaling->mutable_data<T>(dev_ctx.GetPlace());
int* good_out_data = good_out->mutable_data<int>(dev_ctx.GetPlace());
int* bad_out_data = bad_out->mutable_data<int>(dev_ctx.GetPlace());
const int incr_every_n_steps = ctx.Attr<int>("incr_every_n_steps");
const int decr_every_n_nan_or_inf =
ctx.Attr<int>("decr_every_n_nan_or_inf");
const float incr_ratio = ctx.Attr<float>("incr_ratio");
const float decr_ratio = ctx.Attr<float>("decr_ratio");
UpdateLossScalingFunctor<DeviceContext, T>{}(
dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data,
bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio,
decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data);
LazyZeroInputs<DeviceContext, T>{}(dev_ctx, found_inf_data, xs, outs);
}
};
} // namespace operators
} // namespace paddle
...@@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel { ...@@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel {
framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
// CompareOp kernel's device type is decided by input tensor place // CompareOp kernel's device type is decided by input tensor place
bool force_cpu = ctx.Attr<bool>("force_cpu"); bool force_cpu = ctx.Attr<bool>("force_cpu");
kt.place_ = force_cpu ? platform::CPUPlace() if (force_cpu) {
: ctx.Input<framework::LoDTensor>("X")->place(); kt.place_ = platform::CPUPlace();
} else {
if (ctx.Input<framework::LoDTensor>("X")->place().type() !=
typeid(platform::CUDAPinnedPlace)) {
kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
} else {
kt.place_ = ctx.GetPlace();
}
}
return kt; return kt;
} }
}; };
......
...@@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo ...@@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo
detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
if(WITH_GPU) if(WITH_GPU)
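  # CUDA 11 and newer bundle CUB with the toolkit, so the separate cub
  # dependency is only needed for older CUDA versions (hence the check below).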
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) set(TMPDEPS memory)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub) set(TMPDEPS memory cub)
endif()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS})
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS})
detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS})
else() else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc) detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
......
...@@ -176,14 +176,26 @@ static void DistGradFunction(const framework::ExecutionContext& context) { ...@@ -176,14 +176,26 @@ static void DistGradFunction(const framework::ExecutionContext& context) {
} else if (p == INFINITY || p == -INFINITY) { } else if (p == INFINITY || p == -INFINITY) {
// p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if
// j!=i, or equals to sign(z_i) * dout if j=i. // j!=i, or equals to sign(z_i) * dout if j=i.
grad_t.device(place) = if (platform::is_cpu_place(context.GetPlace())) {
(x_minux_y_abs == out_t.broadcast(out_bcast_dims)).template cast<T>() * grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
sign * out_grad_t.broadcast(out_bcast_dims); .template cast<T>() *
sign.eval() * out_grad_t.broadcast(out_bcast_dims);
} else {
grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims))
.template cast<T>() *
sign * out_grad_t.broadcast(out_bcast_dims);
}
} else { } else {
// dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout
grad_t.device(place) = if (platform::is_cpu_place(context.GetPlace())) {
(x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * grad_t.device(place) =
out_grad_t.broadcast(out_bcast_dims); (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) *
sign.eval() * out_grad_t.broadcast(out_bcast_dims);
} else {
grad_t.device(place) =
(x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign *
out_grad_t.broadcast(out_bcast_dims);
}
} }
Eigen::DSizes<int, Rank * 2> x_reshape_dims; Eigen::DSizes<int, Rank * 2> x_reshape_dims;
......
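For reference, the two gradient branches above implement, with z = x - y and out = the p-norm of z (broadcasting and the final reduction over the broadcast axes are handled by the surrounding code):

$$dz_j = \left(\frac{|z_j|}{out}\right)^{p-1} \mathrm{sign}(z_j)\, dout, \qquad dz_j\Big|_{p=\pm\infty} = \mathbf{1}\left[\,|z_j| = out\,\right] \mathrm{sign}(z_j)\, dout$$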
...@@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp, ...@@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp,
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_floordiv, elementwise_floordiv,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>, ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext, ops::ElementwiseFloorDivKernel<paddle::platform::CPUDeviceContext,
int64_t>); int64_t>);
...@@ -19,7 +19,5 @@ namespace plat = paddle::platform; ...@@ -19,7 +19,5 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
elementwise_floordiv, elementwise_floordiv,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>, ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>); ops::ElementwiseFloorDivKernel<plat::CUDADeviceContext, int64_t>);
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#include <math.h>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
...@@ -62,15 +61,8 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx, ...@@ -62,15 +61,8 @@ void elementwise_floor_div(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *x,
const framework::Tensor *y, framework::Tensor *z) { const framework::Tensor *y, framework::Tensor *z) {
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
auto x_dims = x->dims(); ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
auto y_dims = y->dims(); ctx, x, y, axis, FloorDivFunctor<T>(), z);
if (x_dims.size() >= y_dims.size()) {
ElementwiseComputeEx<FloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, FloorDivFunctor<T>(), z);
} else {
ElementwiseComputeEx<InverseFloorDivFunctor<T>, DeviceContext, T>(
ctx, x, y, axis, InverseFloorDivFunctor<T>(), z);
}
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
......
...@@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp { ...@@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
using mkldnn::memory; if (platform::CanMKLDNNBeUsed(ctx)) {
auto CanMKLDNNElementwiseMulBeUsed = [&]() {
auto x_dims = ctx.Input<Tensor>("X")->dims();
auto y_dims = ctx.Input<Tensor>("Y")->dims();
int rankdiff = x_dims.size() - y_dims.size();
// TODO(jczaja): Remove this when oneDNN performance for scalar
// broadcasting
// is improved (Ernie large situation)
if (rankdiff != 0 && y_dims.size() == 1 && y_dims[0] == 1) {
return false;
}
return true;
};
if (platform::CanMKLDNNBeUsed(ctx) && CanMKLDNNElementwiseMulBeUsed()) {
return framework::OpKernelType(input_data_type, ctx.GetPlace(), return framework::OpKernelType(input_data_type, ctx.GetPlace(),
framework::DataLayout::kMKLDNN, framework::DataLayout::kMKLDNN,
framework::LibraryType::kMKLDNN); framework::LibraryType::kMKLDNN);
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/empty_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class EmptyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("ShapeTensor",
"(Tensor<int>), optional). The shape of the output."
"It has a higher priority than Attr(shape).")
.AsDispensable();
AddInput("ShapeTensorList",
"(vector<Tensor<int>>, optional). The shape of the output. "
"It has a higher priority than Attr(shape)."
"The shape of the element in vector must be [1].")
.AsDuplicable()
.AsDispensable();
AddAttr<std::vector<int64_t>>("shape",
"(vector<int64_t>) The shape of the output")
.SetDefault({});
AddAttr<int>("dtype", "The data type of output tensor, Default is float")
.SetDefault(framework::proto::VarType::FP32);
AddOutput("Out", "(Tensor) The output tensor.");
AddComment(R"DOC(empty operator
Returns a tensor filled with uninitialized data. The shape of the tensor is
defined by the variable argument shape.
The type of the tensor is specified by `dtype`.
)DOC");
}
};
class EmptyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* context) const override {
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty");
if (context->HasInput("ShapeTensor")) {
auto dims = context->GetInputDim("ShapeTensor");
int num_ele = 1;
for (int i = 0; i < dims.size(); ++i) {
num_ele *= dims[i];
}
context->SetOutputDim("Out", framework::make_ddim({num_ele}));
} else if (context->HasInputs("ShapeTensorList")) {
std::vector<int> out_dims;
auto dims_list = context->GetInputsDim("ShapeTensorList");
for (size_t i = 0; i < dims_list.size(); ++i) {
auto& dims = dims_list[i];
PADDLE_ENFORCE_EQ(
dims, framework::make_ddim({1}),
"ShapeError: The shape of Tensor in list must be [1]. "
"But received the shape "
"is [%s]",
dims);
out_dims.push_back(dims[0]);
}
context->SetOutputDim("Out", framework::make_ddim(out_dims));
} else {
auto& shape = context->Attrs().Get<std::vector<int64_t>>("shape");
context->SetOutputDim("Out", framework::make_ddim(shape));
}
}
protected:
framework::OpKernelType GetKernelTypeForVar(
const std::string& var_name, const framework::Tensor& tensor,
const framework::OpKernelType& expected_kernel_type) const override {
if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") {
return expected_kernel_type;
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& context) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(context.Attr<int>("dtype")),
context.GetPlace());
}
};
class EmptyOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* context) const override {
auto data_type = static_cast<framework::proto::VarType::Type>(
BOOST_GET_CONST(int, context->GetAttr("dtype")));
context->SetOutputDataType("Out", data_type);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel<plat::CPUDeviceContext, bool>,
ops::EmptyKernel<plat::CPUDeviceContext, int>,
ops::EmptyKernel<plat::CPUDeviceContext, int64_t>,
ops::EmptyKernel<plat::CPUDeviceContext, float>,
ops::EmptyKernel<plat::CPUDeviceContext, double>,
ops::EmptyKernel<plat::CPUDeviceContext, plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/empty_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
empty, ops::EmptyKernel<plat::CUDADeviceContext, bool>,
ops::EmptyKernel<plat::CUDADeviceContext, int>,
ops::EmptyKernel<plat::CUDADeviceContext, int64_t>,
ops::EmptyKernel<plat::CUDADeviceContext, float>,
ops::EmptyKernel<plat::CUDADeviceContext, double>,
ops::EmptyKernel<plat::CUDADeviceContext, plat::float16>);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/utils.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class EmptyKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto dtype = static_cast<framework::proto::VarType::Type>(
context.Attr<int>("dtype"));
Tensor *out_tensor = context.Output<Tensor>("Out");
auto shape = GetShape(context);
out_tensor->Resize(shape);
out_tensor->mutable_data(context.GetPlace(), dtype);
}
};
} // namespace operators
} // namespace paddle
...@@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker<T> {
} }
}; };
template <typename T>
class ExpandDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
if (this->HasInput("expand_times_tensor")) {
op->SetInput("expand_times_tensor", this->Input("expand_times_tensor"));
}
if (this->HasInput("ExpandTimes")) {
op->SetInput("ExpandTimes", this->Input("ExpandTimes"));
}
op->SetAttrMap(this->Attrs());
op->SetType("expand");
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X"); DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X");
} // namespace operators } // namespace operators
...@@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, ...@@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker,
ops::ExpandGradOpMaker<paddle::framework::OpDesc>, ops::ExpandGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandGradOpMaker<paddle::imperative::OpBase>); ops::ExpandGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp, REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp,
ops::ExpandDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandDoubleGradOpMaker<paddle::imperative::OpBase>,
ops::ExpandGradNoNeedBufVarsInferer); ops::ExpandGradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>, expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -230,6 +230,26 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -230,6 +230,26 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker<T> {
} }
}; };
template <typename T>
class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("expand_v2");
op->SetInput("X", this->OutputGrad(framework::GradVarName("X")));
op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out")));
if (this->HasInput("expand_shapes_tensor")) {
op->SetInput("expand_shapes_tensor", this->Input("expand_shapes_tensor"));
}
if (this->HasInput("Shape")) {
op->SetInput("Shape", this->Input("Shape"));
}
op->SetAttrMap(this->Attrs());
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X"); DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X");
} // namespace operators } // namespace operators
...@@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker, ...@@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker,
ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>, ops::ExpandV2GradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>); ops::ExpandV2GradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp, REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp,
ops::ExpandV2DoubleGradOpMaker<paddle::framework::OpDesc>,
ops::ExpandV2DoubleGradOpMaker<paddle::imperative::OpBase>,
ops::ExpandV2GradNoNeedBufVarsInferer); ops::ExpandV2GradNoNeedBufVarsInferer);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>, expand_v2, ops::ExpandV2Kernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -27,27 +27,6 @@ namespace operators { ...@@ -27,27 +27,6 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
inline framework::DDim GetShape(const framework::ExecutionContext &ctx,
std::string op_type) {
// 1. shape is a Tensor
if (ctx.HasInput("ShapeTensor")) {
auto *shape_tensor = ctx.Input<framework::LoDTensor>("ShapeTensor");
auto vec_shape = GetDataFromTensor<int>(shape_tensor);
return framework::make_ddim(vec_shape);
}
// 2. shape is a list/tuple containing Tensor
auto shape_tensor_list = ctx.MultiInput<framework::Tensor>("ShapeTensorList");
if (shape_tensor_list.size() > 0) {
auto vec_shape = GetDataFromTensorList(shape_tensor_list);
return framework::make_ddim(vec_shape);
}
// 3. shape is a list/tuple without containing Tensor
auto vec_shape = ctx.Attr<std::vector<int64_t>>("shape");
return framework::make_ddim(vec_shape);
}
template <typename T> template <typename T>
class FillConstantKernel : public framework::OpKernel<T> { class FillConstantKernel : public framework::OpKernel<T> {
public: public:
...@@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel<T> { ...@@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel<T> {
} }
value = tensor_data[0]; value = tensor_data[0];
} }
const std::string op_type = "fill_constant"; auto shape = GetShape(ctx);
auto shape = GetShape(ctx, op_type);
if (out_var->IsType<framework::LoDTensor>()) { if (out_var->IsType<framework::LoDTensor>()) {
tensor = out_var->GetMutable<framework::LoDTensor>(); tensor = out_var->GetMutable<framework::LoDTensor>();
......
...@@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number); PADDLE_ENFORCE_LT(
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); ids_data[i], row_number,
platform::errors::OutOfRange(
"Value of Ids %d should less than dict size %d.", i, row_number));
PADDLE_ENFORCE_GE(ids_data[i], 0,
platform::errors::OutOfRange(
"Value of Ids %d should greater than ZERO.", i));
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
...@@ -473,8 +478,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -473,8 +478,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT(ids_data[i], row_number); PADDLE_ENFORCE_LT(
PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); ids_data[i], row_number,
platform::errors::OutOfRange(
"Value of Ids %d should less than dict size %d.", i, row_number));
PADDLE_ENFORCE_GE(ids_data[i], 0,
platform::errors::OutOfRange(
"Value of Ids %d should greater than ZERO.", i));
memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} }
......
...@@ -30,16 +30,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -30,16 +30,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru");
OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru");
OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
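  // A 3-D input with shape [T, 1, M] is flattened to the 2-D matrix [T, M]
  // before the dimension checks below.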
PADDLE_ENFORCE_EQ(x_dims.size(), 2, auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
platform::errors::InvalidArgument( ? framework::flatten_to_2d(x_dims, 1)
"Input(X)'s rank must be 2, but received input dim " : x_dims;
"size is:%d, input dim is:[%s]", PADDLE_ENFORCE_EQ(
x_dims.size(), x_dims)); x_mat_dims.size(), 2,
platform::errors::InvalidArgument("The size of input X dims should be 2, "
"or 3 with second dimension equal to "
"1, but now Input X dim is:[%s] ",
x_dims));
auto wx_dims = ctx->GetInputDim("WeightX"); auto wx_dims = ctx->GetInputDim("WeightX");
PADDLE_ENFORCE_EQ(wx_dims.size(), 2, PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
...@@ -47,12 +49,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -47,12 +49,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"The rank of Input(WeightX) should be 2, but received " "The rank of Input(WeightX) should be 2, but received "
"WeightX dim size is:%d, WeightX dim is:[%s] ", "WeightX dim size is:%d, WeightX dim is:[%s] ",
wx_dims.size(), wx_dims)); wx_dims.size(), wx_dims));
PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], PADDLE_ENFORCE_EQ(
platform::errors::InvalidArgument( wx_dims[0], x_mat_dims[1],
"The first dimension of Input(WeightX) " platform::errors::InvalidArgument(
"should equal to second dimension of input x, but " "The first dimension of flattened WeightX"
"received WeightX dimension is:%d, x dimension is:%d", "should equal to last dimension of flattened input X, but "
wx_dims[0], x_dims[1])); "received fattened WeightX dimension is:%d, flattened X dimension "
"is:%d",
wx_dims[0], x_mat_dims[1]));
int frame_size = wx_dims[1] / 3; int frame_size = wx_dims[1] / 3;
auto wh_dims = ctx->GetInputDim("WeightH"); auto wh_dims = ctx->GetInputDim("WeightH");
...@@ -102,24 +106,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -102,24 +106,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
"received bias dim is:[%s], frame size is:%d", "received bias dim is:[%s], frame size is:%d",
b_dims, frame_size)); b_dims, frame_size));
} }
framework::DDim out_dims({x_dims[0], frame_size}); framework::DDim out_dims({x_mat_dims[0], frame_size});
ctx->SetOutputDim("Hidden", out_dims); ctx->SetOutputDim("Hidden", out_dims);
ctx->ShareLoD("X", "Hidden"); ctx->ShareLoD("X", "Hidden");
int xx_width; int xx_width;
if (ctx->Attrs().Get<bool>("use_seq")) { if (ctx->Attrs().Get<bool>("use_seq")) {
xx_width = wx_dims[1]; xx_width = wx_dims[1];
} else { } else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; xx_width = x_mat_dims[1] > wx_dims[1] ? wx_dims[1] : x_mat_dims[1];
OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0", OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0",
"fusion_gru"); "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput", OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput",
"fusion_gru"); "fusion_gru");
OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut", OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut",
"fusion_gru"); "fusion_gru");
ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedInput", {x_mat_dims[0], wx_dims[1]});
ctx->SetOutputDim("BatchedOut", out_dims); ctx->SetOutputDim("BatchedOut", out_dims);
} }
ctx->SetOutputDim("XX", {x_dims[0], xx_width}); ctx->SetOutputDim("XX", {x_mat_dims[0], xx_width});
ctx->ShareLoD("X", "XX"); ctx->ShareLoD("X", "XX");
} }
...@@ -202,6 +206,27 @@ void FusionGRUOpMaker::Make() { ...@@ -202,6 +206,27 @@ void FusionGRUOpMaker::Make() {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<std::string>(
"mkldnn_data_type",
"(string, default \"float32\"). Data type of mkldnn kernel")
.SetDefault("float32")
.InEnum({"float32", "int8", "bfloat16"});
AddAttr<float>("Scale_data",
"Scale to be used for int8 input/output data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Shift_data",
"Shift to be used for int8 input/output data."
"Only used with MKL-DNN INT8.")
.SetDefault(0.0f);
AddAttr<std::vector<float>>("Scale_weights",
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8.")
.SetDefault({1.0f});
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
The Fusion complete GRU Operator. The Fusion complete GRU Operator.
This operator fuses the fully-connected operator into GRU, This operator fuses the fully-connected operator into GRU,
...@@ -220,14 +245,17 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -220,14 +245,17 @@ class FusionGRUKernel : public framework::OpKernel<T> {
} }
} }
#define INIT_BASE_DEFINES \ #define INIT_BASE_DEFINES \
auto* x = ctx.Input<LoDTensor>("X"); \ auto* x = ctx.Input<LoDTensor>("X"); \
auto* wh = ctx.Input<Tensor>("WeightH"); \ auto* wh = ctx.Input<Tensor>("WeightH"); \
auto* xx = ctx.Output<LoDTensor>("XX"); \ auto* xx = ctx.Output<LoDTensor>("XX"); \
auto x_lod = x->lod(); \ auto x_lod = x->lod(); \
auto x_dims = x->dims(); /* T x M*/ \ auto x_dims = x->dims(); /* T x M*/ \
auto wh_dims = wh->dims(); /* D x 3D*/ \ auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) \
const int total_T = x_dims[0]; \ ? framework::flatten_to_2d(x_dims, 1) \
: x_dims; \
auto wh_dims = wh->dims(); /* D x 3D*/ \
const int total_T = x_mat_dims[0]; \
const int D3 = wh_dims[1] const int D3 = wh_dims[1]
#define INIT_OTHER_DEFINES \ #define INIT_OTHER_DEFINES \
...@@ -236,7 +264,7 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -236,7 +264,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
auto* bias = ctx.Input<Tensor>("Bias"); \ auto* bias = ctx.Input<Tensor>("Bias"); \
auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \ auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
bool is_reverse = ctx.Attr<bool>("is_reverse"); \ bool is_reverse = ctx.Attr<bool>("is_reverse"); \
const int M = x_dims[1]; \ const int M = x_mat_dims[1]; \
const int D = wh_dims[0]; \ const int D = wh_dims[0]; \
const int D2 = D * 2; \ const int D2 = D * 2; \
const jit::gru_attr_t attr( \ const jit::gru_attr_t attr( \
......
...@@ -21,11 +21,12 @@ namespace operators { ...@@ -21,11 +21,12 @@ namespace operators {
using paddle::framework::LoDTensor; using paddle::framework::LoDTensor;
using paddle::framework::Tensor; using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext; using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc; using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast; using platform::to_void_cast;
template <typename T> template <typename T, typename T_out = T>
class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
public: public:
GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
...@@ -38,7 +39,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -38,7 +39,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
const std::string& unique_name) const std::string& unique_name)
: platform::MKLDNNHandlerT<T, dnnl::gru_forward>( : platform::MKLDNNHandlerT<T, dnnl::gru_forward>(
dev_ctx, dev_ctx.GetEngine(), cpu_place, dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(unique_name, Ti)), CreateKey(unique_name, MKLDNNGetDataType<T>(), Ti)),
N(N), N(N),
Ti(Ti), Ti(Ti),
IC(IC), IC(IC),
...@@ -47,9 +48,29 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -47,9 +48,29 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// do not depend on Ti size but primitive and input/output memory do // do not depend on Ti size but primitive and input/output memory do
if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() != if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() !=
platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) { platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) {
memory_key_ = unique_name; memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>());
} else { } else {
memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr(); memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>(), "-t:",
platform::ThreadIDasStr());
}
// Is it int8 kernel
const bool is_INT8 = std::is_same<T, uint8_t>::value;
if (is_INT8) {
// Int8 attributes
const float scale_data = ctx.Attr<float>("Scale_data");
const float shift_data = ctx.Attr<float>("Shift_data");
const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
const int weights_scale_mask =
0 +
(1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo`
+
(1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo`
attr_.set_rnn_data_qparams(scale_data, shift_data);
attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
} }
if (!this->isCached()) { if (!this->isCached()) {
...@@ -63,6 +84,10 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -63,6 +84,10 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
platform::errors::Unimplemented( platform::errors::Unimplemented(
"oneDNN fusion_gru supports only tanh as an activation.")); "oneDNN fusion_gru supports only tanh as an activation."));
// Weights for int8 kernel are of a type s8
const auto weights_dt =
is_INT8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32;
// oneDNN RNN dimensions // oneDNN RNN dimensions
const int64_t D = 1; // Directions const int64_t D = 1; // Directions
const int64_t L = 1; // Layers (PP supports only 1 stacked layer) const int64_t L = 1; // Layers (PP supports only 1 stacked layer)
...@@ -71,19 +96,16 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -71,19 +96,16 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// Create memory descriptors // Create memory descriptors
auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(), auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::any); MKLDNNMemoryFormat::any);
auto weight_x_md = MKLDNNMemDesc( auto weight_x_md =
{L, D, IC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any); MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto weight_h_md = MKLDNNMemDesc( auto weight_h_md =
{L, D, OC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any); MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(), auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldgo); MKLDNNMemoryFormat::ldgo);
auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T>(), auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
MKLDNNMemoryFormat::any); MKLDNNMemoryFormat::any);
auto h0_md = dnnl::memory::desc(); auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
if (h0) { MKLDNNMemoryFormat::ldnc);
h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::ldnc);
}
// Create GRU oneDNN primitive // Create GRU oneDNN primitive
const auto direction = const auto direction =
...@@ -91,7 +113,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -91,7 +113,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
: dnnl::rnn_direction::unidirectional_left2right; : dnnl::rnn_direction::unidirectional_left2right;
this->AcquireForwardPrimitiveDescriptor( this->AcquireForwardPrimitiveDescriptor(
dnnl::prop_kind::forward_inference, direction, input_md, h0_md, attr_, dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc()); weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc());
} }
} }
...@@ -101,29 +123,31 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -101,29 +123,31 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
dnnl::memory::format_tag::ntc); dnnl::memory::format_tag::ntc);
} }
void reorderRNNdata(const T* input_data, T* output_data, void reorderRNNdata(void* input_data, void* output_data,
std::vector<size_t> lod, const bool is_reverse, std::vector<size_t> lod, const bool is_reverse,
platform::RNNReorderType reorder_type) { platform::RNNReorderType reorder_type) {
switch (reorder_type) { switch (reorder_type) {
// Reorder input memory [WORDS, C] + LoD -> [N, T, C] // Reorder input memory [WORDS, C] + LoD -> [N, T, C]
case platform::RNNReorderType::PP_NTC: { case platform::RNNReorderType::PP_NTC: {
auto* input_data_iter = input_data; auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) { for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * IC; const auto num_elements = (lod[n + 1] - lod[n]) * IC;
const auto offset = is_reverse ? (Ti * IC - num_elements) : 0; const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
memcpy(output_data + n * Ti * IC + offset, input_data_iter, memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
sizeof(T) * num_elements); sizeof(T) * num_elements);
input_data_iter += num_elements; input_data_iter += num_elements;
} }
} break; } break;
// Reorder input memory [WORDS, C] + LoD -> [T, N, C] // Reorder input memory [WORDS, C] + LoD -> [T, N, C]
case platform::RNNReorderType::PP_TNC: { case platform::RNNReorderType::PP_TNC: {
auto* input_data_iter = input_data; auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) { for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]); const auto num_elements = (lod[n + 1] - lod[n]);
const auto offset = is_reverse ? (Ti - num_elements) : 0; const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) { for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data + (t + offset) * N * IC + n * IC, memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
input_data_iter, sizeof(T) * IC); input_data_iter, sizeof(T) * IC);
input_data_iter += IC; input_data_iter += IC;
} }
...@@ -131,24 +155,27 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -131,24 +155,27 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
} break; } break;
// Reorder output values to PP format [N, T, C] -> [WORDS, C] // Reorder output values to PP format [N, T, C] -> [WORDS, C]
case platform::RNNReorderType::NTC_PP: { case platform::RNNReorderType::NTC_PP: {
auto* output_data_iter = output_data; auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) { for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * OC; const auto num_elements = (lod[n + 1] - lod[n]) * OC;
const auto offset = is_reverse ? (Ti * OC - num_elements) : 0; const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
memcpy(output_data_iter, input_data + n * Ti * OC + offset, memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
sizeof(T) * num_elements); sizeof(T_out) * num_elements);
output_data_iter += num_elements; output_data_iter += num_elements;
} }
} break; } break;
// Reorder output values to PP format [T, N, C] -> [WORDS, C] // Reorder output values to PP format [T, N, C] -> [WORDS, C]
case platform::RNNReorderType::TNC_PP: { case platform::RNNReorderType::TNC_PP: {
auto* output_data_iter = output_data; auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) { for (int n = 0; n < N; ++n) {
const auto num_elements = lod[n + 1] - lod[n]; const auto num_elements = lod[n + 1] - lod[n];
const auto offset = is_reverse ? (Ti - num_elements) : 0; const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) { for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter, memcpy(output_data_iter,
input_data + (t + offset) * N * OC + n * OC, sizeof(T) * OC); input_data_iter + (t + offset) * N * OC + n * OC,
sizeof(T_out) * OC);
output_data_iter += OC; output_data_iter += OC;
} }
} }
...@@ -169,9 +196,9 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -169,9 +196,9 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
} }
const auto& input_lod = input->lod()[0]; const auto& input_lod = input->lod()[0];
auto* x_data = input->data<T>(); auto* x_data = to_void_cast(input->data<T>());
auto* x_onednn_data = reinterpret_cast<T*>(memory_p->get_data_handle()); auto* x_onednn_data = memory_p->get_data_handle();
memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC); memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) == if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
...@@ -198,19 +225,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -198,19 +225,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
return memory_p; return memory_p;
} }
// TODO(grygielski) H0 is for now persistable
std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) { std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
const std::string h0_key = memory_key_ + "@h0"; const std::string h0_key = memory_key_ + "@h0";
auto memory_p = auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key)); std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
auto* h0_data = to_void_cast(h0->data<T>());
if (!memory_p) { if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>( auto user_h0_memory = dnnl::memory();
this->fwd_pd_->weights_layer_desc(), this->engine_, h0_data); if (h0) {
user_h0_memory =
dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_, to_void_cast(h0->data<float>()));
} else {
user_h0_memory = dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_);
memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
}
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
this->engine_);
dnnl::stream astream(this->engine_);
dnnl::reorder(user_h0_memory, *memory_p, attr_)
.execute(astream, user_h0_memory, *memory_p);
this->dev_ctx_.SetBlob(h0_key, memory_p); this->dev_ctx_.SetBlob(h0_key, memory_p);
} else {
memory_p->set_data_handle(h0_data);
} }
return memory_p; return memory_p;
} }
...@@ -245,7 +288,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -245,7 +288,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_layer_desc(), this->engine_); this->fwd_pd_->weights_layer_desc(), this->engine_);
dnnl::stream astream(this->engine_); dnnl::stream astream(this->engine_);
dnnl::reorder(user_memory, *memory_p) dnnl::reorder(user_memory, *memory_p, attr_)
.execute(astream, user_memory, *memory_p); .execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wx_key, memory_p); this->dev_ctx_.SetBlob(wx_key, memory_p);
...@@ -298,7 +341,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -298,7 +341,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_iter_desc(), this->engine_); this->fwd_pd_->weights_iter_desc(), this->engine_);
dnnl::stream astream(this->engine_); dnnl::stream astream(this->engine_);
dnnl::reorder(user_memory, *memory_p) dnnl::reorder(user_memory, *memory_p, attr_)
.execute(astream, user_memory, *memory_p); .execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wh_key, memory_p); this->dev_ctx_.SetBlob(wh_key, memory_p);
...@@ -347,12 +390,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -347,12 +390,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
// Memory size of weights, bias and h0 does not depend // Memory size of weights, bias and h0 does not depend
// on Ti size, thus we need another key to cache them // on Ti size, thus we need another key to cache them
std::string memory_key_; std::string memory_key_;
dnnl::primitive_attr attr_;
}; };
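The new AcquireH0Memory path above builds a user-side f32 memory in ldnc layout (zero-filled when no H0 tensor is provided) and reorders it into whatever layout the primitive descriptor chose for src_iter. A minimal standalone sketch of that reorder pattern, assuming oneDNN 1.x (dnnl.hpp) on a CPU engine; the {L, D, N, OC} sizes and the plain ldnc destination layout are illustrative stand-ins for fwd_pd_->src_iter_desc():

#include <vector>
#include "dnnl.hpp"

int main() {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream strm(eng);

  // {L, D, N, OC} = {1, 1, 2, 4}; zero initial hidden state, as in the
  // "no H0 given" branch of AcquireH0Memory.
  dnnl::memory::dims dims = {1, 1, 2, 4};
  std::vector<float> h0(2 * 4, 0.0f);

  auto user_md = dnnl::memory::desc(dims, dnnl::memory::data_type::f32,
                                    dnnl::memory::format_tag::ldnc);
  dnnl::memory user_mem(user_md, eng, h0.data());

  // In the kernel the destination layout comes from fwd_pd_->src_iter_desc();
  // a plain ldnc layout is used here just to show the reorder mechanics.
  dnnl::memory prim_mem(user_md, eng);

  dnnl::reorder(user_mem, prim_mem).execute(strm, user_mem, prim_mem);
  strm.wait();
  return 0;
}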
template <typename T> template <typename T>
class FusionGRUMKLDNNKernel : public framework::OpKernel<T> { class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const bool is_INT8 = std::is_same<T, uint8_t>::value;
const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
// TODO(grygielski) Add option for bfloat
if (!is_INT8 || force_fp32_output) {
RunKernel<float>(ctx);
} else {
RunKernel<uint8_t>(ctx);
}
}
template <typename Tout = T>
void RunKernel(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>(); ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine(); const auto& mkldnn_engine = dev_ctx.GetEngine();
...@@ -364,13 +421,16 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> { ...@@ -364,13 +421,16 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
const auto* weight_h = ctx.Input<Tensor>("WeightH"); const auto* weight_h = ctx.Input<Tensor>("WeightH");
const auto* bias = ctx.Input<Tensor>("Bias"); const auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden = ctx.Output<LoDTensor>("Hidden"); auto* hidden = ctx.Output<LoDTensor>("Hidden");
auto x_dims = input->dims();
auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
? framework::flatten_to_2d(x_dims, 1)
: x_dims;
// Get attributes // Get attributes
const bool is_reverse = ctx.Attr<bool>("is_reverse"); const bool is_reverse = ctx.Attr<bool>("is_reverse");
const bool origin_mode = ctx.Attr<bool>("origin_mode"); const bool origin_mode = ctx.Attr<bool>("origin_mode");
// Get tensor dimensions // Get tensor dimensions
const auto x_dims = framework::vectorize(input->dims()); const auto x_mat_dims_vec = framework::vectorize(x_mat_dims);
const auto weight_h_dims = framework::vectorize(weight_h->dims()); const auto weight_h_dims = framework::vectorize(weight_h->dims());
const auto& input_lod = input->lod()[0]; const auto& input_lod = input->lod()[0];
...@@ -384,15 +444,17 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> { ...@@ -384,15 +444,17 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
} }
return res; return res;
}(); }();
const int64_t IC = x_dims[1]; // Input channels const int64_t IC = x_mat_dims_vec[1]; // Input channels
const int64_t OC = weight_h_dims[0]; // Output channels const int64_t OC = weight_h_dims[0]; // Output channels
GRUMKLDNNHandler<T> handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), GRUMKLDNNHandler<T, Tout> handler(
input, weight_h, h0, is_reverse, N, Ti, IC, OC, ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
ctx.InputName("X") + ctx.InputName("WeightH")); is_reverse, N, Ti, IC, OC,
ctx.InputName("X") + ctx.InputName("WeightH"));
auto input_memory_p = auto input_memory_p =
handler.AcquireInputMemoryWithReorder(input, is_reverse); handler.AcquireInputMemoryWithReorder(input, is_reverse);
auto h0_memory_p = handler.AcquireH0Memory(h0);
auto weight_x_memory_p = auto weight_x_memory_p =
handler.AcquireWeightXMemory(weight_x, origin_mode); handler.AcquireWeightXMemory(weight_x, origin_mode);
auto weight_h_memory_p = auto weight_h_memory_p =
...@@ -402,25 +464,21 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> { ...@@ -402,25 +464,21 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
std::unordered_map<int, dnnl::memory> gru_args = { std::unordered_map<int, dnnl::memory> gru_args = {
{DNNL_ARG_SRC_LAYER, *input_memory_p}, {DNNL_ARG_SRC_LAYER, *input_memory_p},
{DNNL_ARG_SRC_ITER, *h0_memory_p},
{DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p}, {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
{DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p}, {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
{DNNL_ARG_BIAS, *bias_memory_p}, {DNNL_ARG_BIAS, *bias_memory_p},
{DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}}; {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};
if (h0) {
auto h0_memory_p = handler.AcquireH0Memory(h0);
gru_args.insert({DNNL_ARG_SRC_ITER, *h0_memory_p});
}
auto gru_forward_p = handler.AcquireForwardPrimitive(); auto gru_forward_p = handler.AcquireForwardPrimitive();
dnnl::stream astream(mkldnn_engine); dnnl::stream astream(mkldnn_engine);
gru_forward_p->execute(astream, gru_args); gru_forward_p->execute(astream, gru_args);
astream.wait(); astream.wait();
auto* hidden_onednn_data = auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle();
reinterpret_cast<T*>(hidden_onednn_memory_p->get_data_handle()); auto* hidden_data =
auto* hidden_data = hidden->mutable_data<T>(ctx.GetPlace()); to_void_cast(hidden->mutable_data<Tout>(ctx.GetPlace()));
if (handler.is_NTC()) { if (handler.is_NTC()) {
handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod, handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
is_reverse, platform::RNNReorderType::NTC_PP); is_reverse, platform::RNNReorderType::NTC_PP);
...@@ -436,4 +494,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> { ...@@ -436,4 +494,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace, REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace,
ops::FusionGRUMKLDNNKernel<float>); ops::FusionGRUMKLDNNKernel<float>,
ops::FusionGRUMKLDNNKernel<uint8_t>);
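The registration above pairs the float kernel with a uint8_t one, and Compute picks the output element type at run time from the input type and the force_fp32_output attribute. A standalone mirror of just that dispatch, with FakeRun as a hypothetical stand-in for RunKernel<Tout>:

#include <cstdint>
#include <iostream>
#include <type_traits>

// Hypothetical stand-in for RunKernel<Tout>: only reports the chosen type.
template <typename Tout>
void FakeRun() {
  std::cout << "running with output element size " << sizeof(Tout) << "\n";
}

// Mirrors FusionGRUMKLDNNKernel::Compute: uint8_t inputs keep uint8_t
// outputs unless force_fp32_output asks for float.
template <typename T>
void Compute(bool force_fp32_output) {
  const bool is_INT8 = std::is_same<T, uint8_t>::value;
  if (!is_INT8 || force_fp32_output) {
    FakeRun<float>();
  } else {
    FakeRun<uint8_t>();
  }
}

int main() {
  Compute<float>(false);    // fp32 path
  Compute<uint8_t>(false);  // int8 path
  Compute<uint8_t>(true);   // int8 input forced to fp32 output
  return 0;
}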
...@@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel<T> {
auto* tensor = context.Output<framework::Tensor>("Out"); auto* tensor = context.Output<framework::Tensor>("Out");
std::normal_distribution<T> dist(mean, std); std::normal_distribution<T> dist(mean, std);
const std::string op_type = "gaussian_random"; auto shape = GetShape(context);
auto shape = GetShape(context, op_type);
tensor->Resize(shape); tensor->Resize(shape);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
......
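For reference, the CPU gaussian_random kernel above resizes the output to the shape returned by GetShape and fills it from std::normal_distribution. A minimal sketch of that fill with a plain buffer standing in for framework::Tensor, assuming the usual convention that a zero seed means drawing one from std::random_device:

#include <cstdint>
#include <random>
#include <vector>

int main() {
  const float mean = 0.0f, std_dev = 1.0f;
  const unsigned int seed = 0;                // 0: draw a seed from random_device
  const std::vector<int64_t> shape = {2, 3};  // illustrative shape

  int64_t size = 1;
  for (auto d : shape) size *= d;
  std::vector<float> data(size);

  std::minstd_rand engine(seed == 0 ? std::random_device{}() : seed);
  std::normal_distribution<float> dist(mean, std_dev);
  for (int64_t i = 0; i < size; ++i) data[i] = dist(engine);
  return 0;
}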
...@@ -58,8 +58,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> { ...@@ -58,8 +58,7 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
T mean = static_cast<T>(context.Attr<float>("mean")); T mean = static_cast<T>(context.Attr<float>("mean"));
T std = static_cast<T>(context.Attr<float>("std")); T std = static_cast<T>(context.Attr<float>("std"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<unsigned int> index_sequence_begin(0);
const std::string op_type = "gaussian_random"; auto shape = GetShape(context);
auto shape = GetShape(context, op_type);
tensor->Resize(shape); tensor->Resize(shape);
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
......
...@@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase { ...@@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase {
private: private:
std::vector<std::string> in_names_; std::vector<std::string> in_names_;
std::vector<std::string> out_names_; std::vector<std::string> out_names_;
paddle::lite::Predictor *engine_; paddle::lite_api::PaddlePredictor *engine_;
framework::proto::VarType::Type precision_; framework::proto::VarType::Type precision_;
bool use_gpu_; bool use_gpu_;
bool zero_copy_; bool zero_copy_;
...@@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase { ...@@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase {
framework::LoDTensor src_t = framework::LoDTensor src_t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, inference::analysis::GetFromScope<framework::LoDTensor>(scope,
in_names_[i]); in_names_[i]);
paddle::lite::Tensor *dst_t = engine_->GetInput(i); paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i));
VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> " VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> "
<< engine_->GetInputNames()[i] << ")"; << engine_->GetInputNames()[i] << ")";
inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_); inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(dev_place)) { if (platform::is_gpu_place(dev_place)) {
...@@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase { ...@@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase {
engine_->Run(); engine_->Run();
VLOG(3) << "lite engine run done"; VLOG(3) << "lite engine run done";
for (size_t i = 0; i < out_names_.size(); i++) { for (size_t i = 0; i < out_names_.size(); i++) {
paddle::lite::Tensor src_t = *(engine_->GetOutput(i)); paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i));
framework::LoDTensor *dst_t = framework::LoDTensor *dst_t =
&inference::analysis::GetFromScope<framework::LoDTensor>( &inference::analysis::GetFromScope<framework::LoDTensor>(
scope, out_names_[i]); scope, out_names_[i]);
......
...@@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) { ...@@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) {
inference::lite::EngineConfig config; inference::lite::EngineConfig config;
config.valid_places = { config.valid_places = {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}),
#endif #endif
paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}),
paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}),
}; };
serialize_params(&(config.param), &scope, repetitive_params); serialize_params(&(config.param), &scope, repetitive_params);
config.model = program.Proto()->SerializeAsString(); config.model = program.Proto()->SerializeAsString();
......
...@@ -9,7 +9,11 @@ function(math_library TARGET) ...@@ -9,7 +9,11 @@ function(math_library TARGET)
set(hip_srcs) set(hip_srcs)
set(math_common_deps device_context framework_proto enforce) set(math_common_deps device_context framework_proto enforce)
if (WITH_GPU) if (WITH_GPU)
list(APPEND math_common_deps cub) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
list(APPEND math_common_deps cub)
else()
list(APPEND math_common_deps)
endif()
endif() endif()
set(multiValueArgs DEPS) set(multiValueArgs DEPS)
cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
......
...@@ -128,9 +128,23 @@ struct RowwiseAdd<platform::CPUDeviceContext, T> { ...@@ -128,9 +128,23 @@ struct RowwiseAdd<platform::CPUDeviceContext, T> {
const framework::Tensor& input, const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) { const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims();
auto size = input.numel() / in_dims[0]; auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(output->dims(), in_dims); vector.numel(), size,
platform::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
size, vector.numel()));
const std::string in_dims_str = in_dims.to_str();
const std::string out_dims_str = out_dims.to_str();
PADDLE_ENFORCE_EQ(out_dims, in_dims,
platform::errors::InvalidArgument(
"The output tensor shape should be the same as the input"
" tensor shape. Expected output tensor shape: %s,"
" but received %s",
in_dims_str.c_str(), out_dims_str.c_str()));
auto in = framework::EigenMatrix<T>::From(input); auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector); auto vec = framework::EigenVector<T>::Flatten(vector);
......
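Most of the remaining hunks follow the same pattern as the RowwiseAdd change above: a bare PADDLE_ENFORCE_EQ(a, b) gains a third argument built from platform::errors, so a failed check reports a typed error carrying the expected and received values. A standalone analogue of that pattern using plain exceptions (enforce_eq is a hypothetical helper, not Paddle's macro):

#include <sstream>
#include <stdexcept>
#include <string>

// Hypothetical analogue of PADDLE_ENFORCE_EQ(a, b, InvalidArgument(...)):
// compare, and on mismatch throw a typed error with a readable message.
template <typename T>
void enforce_eq(const T& a, const T& b, const std::string& what) {
  if (!(a == b)) {
    std::ostringstream msg;
    msg << what << " Expected " << a << ", but received " << b << ".";
    throw std::invalid_argument(msg.str());
  }
}

int main() {
  int64_t vector_numel = 8, row_size = 8;
  enforce_eq(vector_numel, row_size,
             "The input vector size should equal the row size of the input tensor.");
  return 0;
}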
...@@ -88,9 +88,24 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> { ...@@ -88,9 +88,24 @@ struct RowwiseAdd<platform::CUDADeviceContext, T> {
const framework::Tensor& input, const framework::Tensor& input,
const framework::Tensor& vector, framework::Tensor* output) { const framework::Tensor& vector, framework::Tensor* output) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims();
auto size = input.numel() / in_dims[0]; auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(output->dims(), in_dims); vector.numel(), size,
platform::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
size, vector.numel()));
const std::string in_dims_str = in_dims.to_str();
const std::string out_dims_str = out_dims.to_str();
PADDLE_ENFORCE_EQ(
out_dims, in_dims,
platform::errors::InvalidArgument(
"The output tensor shape should be the same as the input tensor"
" shape. Expected output tensor shape: %s,"
" but received %s",
in_dims_str.c_str(), out_dims_str.c_str()));
int blocks = 512; int blocks = 512;
int grids = (input.numel() + blocks - 1) / blocks; int grids = (input.numel() + blocks - 1) / blocks;
RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>( RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
...@@ -113,7 +128,12 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()( ...@@ -113,7 +128,12 @@ void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
framework::Tensor* vector) { framework::Tensor* vector) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto size = input.numel() / in_dims[0]; auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), size); PADDLE_ENFORCE_EQ(vector->numel(), size,
platform::errors::InvalidArgument(
"The size of input vector"
" should be equal to the size of input tensor column"
" dimension. Expected vector size=%d, but received %d",
size, vector->numel()));
framework::Tensor one; framework::Tensor one;
one.mutable_data<double>({in_dims[0]}, context.GetPlace()); one.mutable_data<double>({in_dims[0]}, context.GetPlace());
SetConstant<platform::CUDADeviceContext, double> set; SetConstant<platform::CUDADeviceContext, double> set;
...@@ -134,7 +154,12 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()( ...@@ -134,7 +154,12 @@ void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
framework::Tensor* vector) { framework::Tensor* vector) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto size = input.numel() / in_dims[0]; auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]); PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of input vector"
" should be equal to the size of input tensor row"
" dimension. Expected vector size=%d, but received %d",
in_dims[0], vector->numel()));
framework::Tensor one; framework::Tensor one;
one.mutable_data<double>({size}, context.GetPlace()); one.mutable_data<double>({size}, context.GetPlace());
SetConstant<platform::CUDADeviceContext, double> set; SetConstant<platform::CUDADeviceContext, double> set;
......
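Both RowwiseAdd specializations check the same two facts: the vector length must equal input.numel() / in_dims[0] (the row width) and the output shape must match the input shape, because the operation broadcasts the vector over every row. A plain-loop sketch of that semantics:

#include <cassert>
#include <vector>

// out[i][j] = in[i][j] + vec[j]; in and out are rows x size, vec has length size.
void rowwise_add(const std::vector<float>& in, const std::vector<float>& vec,
                 std::vector<float>* out, int rows, int size) {
  assert(static_cast<int>(vec.size()) == size);
  assert(in.size() == out->size() && static_cast<int>(in.size()) == rows * size);
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < size; ++j)
      (*out)[i * size + j] = in[i * size + j] + vec[j];
}

int main() {
  std::vector<float> in = {1, 2, 3, 4, 5, 6};  // 2 x 3
  std::vector<float> vec = {10, 20, 30};
  std::vector<float> out(6);
  rowwise_add(in, vec, &out, 2, 3);            // out = {11,22,33,14,25,36}
  return 0;
}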
...@@ -59,7 +59,12 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context, ...@@ -59,7 +59,12 @@ void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
framework::Tensor* out) { framework::Tensor* out) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto size = input.numel() / in_dims[0]; auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(out->numel(), size); PADDLE_ENFORCE_EQ(out->numel(), size,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor column"
" dimension. Expected output size=%d, but received %d",
size, out->numel()));
auto in = framework::EigenMatrix<T>::From(input); auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out); auto vec = framework::EigenVector<T>::Flatten(*out);
...@@ -78,7 +83,13 @@ class ColwiseSum<platform::CPUDeviceContext, T> { ...@@ -78,7 +83,13 @@ class ColwiseSum<platform::CPUDeviceContext, T> {
auto& in_dims = input.dims(); auto& in_dims = input.dims();
auto height = in_dims[0]; auto height = in_dims[0];
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size); PADDLE_ENFORCE_EQ(
out->numel(), size,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor column"
" dimension. Expected output size=%d, but received %d",
size, out->numel()));
T* out_buf = out->mutable_data<T>(out->place()); T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
...@@ -100,8 +111,16 @@ void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context, ...@@ -100,8 +111,16 @@ void RowwiseMean<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
framework::Tensor* out) { framework::Tensor* out) {
auto in_dims = input.dims(); auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U); PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); "The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
in_dims[0], out->numel()));
auto in = framework::EigenMatrix<T>::From(input); auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out); auto vec = framework::EigenVector<T>::Flatten(*out);
...@@ -118,10 +137,19 @@ class RowwiseMean<platform::CPUDeviceContext, T> { ...@@ -118,10 +137,19 @@ class RowwiseMean<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) { const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims(); auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U); PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
auto height = in_dims[0]; auto height = in_dims[0];
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(
out->numel(), height,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
height, out->numel()));
auto inv_size = 1.0 / size; auto inv_size = 1.0 / size;
T* out_buf = out->mutable_data<T>(out->place()); T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
...@@ -141,8 +169,16 @@ void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context, ...@@ -141,8 +169,16 @@ void RowwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& input,
framework::Tensor* out) { framework::Tensor* out) {
auto in_dims = input.dims(); auto in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U); PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); "The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
PADDLE_ENFORCE_EQ(out->numel(), in_dims[0],
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
in_dims[0], out->numel()));
auto in = framework::EigenMatrix<T>::From(input); auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(*out); auto vec = framework::EigenVector<T>::Flatten(*out);
...@@ -159,10 +195,19 @@ class RowwiseSum<platform::CPUDeviceContext, T> { ...@@ -159,10 +195,19 @@ class RowwiseSum<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) { const framework::Tensor& input, framework::Tensor* out) {
auto& in_dims = input.dims(); auto& in_dims = input.dims();
PADDLE_ENFORCE_EQ(in_dims.size(), 2U); PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument(
"The rank of input tensor "
"should be 2, but received %d",
in_dims.size()));
auto height = in_dims[0]; auto height = in_dims[0];
auto size = in_dims[1]; auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height); PADDLE_ENFORCE_EQ(
out->numel(), height,
platform::errors::InvalidArgument(
"The size of output tensor "
"should be equal to the size of input tensor row"
" dimension. Expected output size=%d, but received %d",
height, out->numel()));
T* out_buf = out->mutable_data<T>(out->place()); T* out_buf = out->mutable_data<T>(out->place());
const T* in_buf = input.data<T>(); const T* in_buf = input.data<T>();
......
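The checks added in math_function_impl.h pin down the reduction shapes: ColwiseSum produces one value per column, while RowwiseMean and RowwiseSum expect a rank-2 input and produce one value per row. A loop-level sketch of the two reductions:

#include <vector>

// ColwiseSum: out[j] = sum_i in[i][j].
void colwise_sum(const std::vector<float>& in, int height, int width,
                 std::vector<float>* out) {
  out->assign(width, 0.0f);
  for (int i = 0; i < height; ++i)
    for (int j = 0; j < width; ++j) (*out)[j] += in[i * width + j];
}

// RowwiseMean: out[i] = (1 / width) * sum_j in[i][j].
void rowwise_mean(const std::vector<float>& in, int height, int width,
                  std::vector<float>* out) {
  out->assign(height, 0.0f);
  const float inv_size = 1.0f / width;
  for (int i = 0; i < height; ++i) {
    float sum = 0.0f;
    for (int j = 0; j < width; ++j) sum += in[i * width + j];
    (*out)[i] = sum * inv_size;
  }
}

int main() {
  std::vector<float> in = {1, 2, 3, 4, 5, 6};  // 2 x 3
  std::vector<float> cols, rows;
  colwise_sum(in, 2, 3, &cols);   // {5, 7, 9}
  rowwise_mean(in, 2, 3, &rows);  // {2, 5}
  return 0;
}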
...@@ -224,7 +224,11 @@ TEST(math_funciton, set_constant) { ...@@ -224,7 +224,11 @@ TEST(math_funciton, set_constant) {
auto* ctx = new paddle::platform::CPUDeviceContext(); auto* ctx = new paddle::platform::CPUDeviceContext();
paddle::operators::math::set_constant(*ctx, &t, 10); paddle::operators::math::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) { for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10, t.data<int>()[i]); PADDLE_ENFORCE_EQ(10, t.data<int>()[i],
paddle::platform::errors::InvalidArgument(
"Each value of input"
"tensor should be 10, but received %d.",
t.data<int>()[i]));
} }
delete ctx; delete ctx;
} }
......
...@@ -18,7 +18,12 @@ ...@@ -18,7 +18,12 @@
void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
const std::vector<float>& data) { const std::vector<float>& data) {
PADDLE_ENFORCE_EQ(size, data.size()); PADDLE_ENFORCE_EQ(
size, data.size(),
paddle::platform::errors::InvalidArgument(
"The size of argument data should"
" be equal to the argument size. Expected %d, but received %d.",
size, data.size()));
for (size_t i = 0; i < data.size(); ++i) { for (size_t i = 0; i < data.size(); ++i) {
in_ptr[i] = paddle::platform::float16(data[i]); in_ptr[i] = paddle::platform::float16(data[i]);
} }
......
...@@ -85,8 +85,9 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context, ...@@ -85,8 +85,9 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context,
PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out); PadFunction<DeviceContext, T, 6>(context, pads, src, pad_value, out);
break; break;
default: default:
PADDLE_THROW( PADDLE_THROW(platform::errors::Unimplemented(
"PadOp only support tensors with no more than 6 dimensions."); "PadOp only support tensors with no more"
" than 6 dimensions currently."));
} }
} }
...@@ -114,8 +115,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, ...@@ -114,8 +115,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context,
PadGradFunction<DeviceContext, T, 6>(context, pads, src, out); PadGradFunction<DeviceContext, T, 6>(context, pads, src, out);
break; break;
default: default:
PADDLE_THROW( PADDLE_THROW(platform::errors::Unimplemented(
"PadOp only support tensors with no more than 6 dimensions."); "PadOp only support tensors with no more"
" than 6 dimensions currently."));
} }
} }
......
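PaddingFunctor and PaddingGradFunctor dispatch on tensor rank and, after this change, raise an Unimplemented error instead of an untyped PADDLE_THROW for ranks above 6. A stripped-down sketch of that dispatch, with pad_rank<R> as a hypothetical stand-in for PadFunction<DeviceContext, T, R>:

#include <iostream>
#include <stdexcept>

template <int Rank>
void pad_rank() {  // hypothetical stand-in for PadFunction<..., Rank>
  std::cout << "padding a rank-" << Rank << " tensor\n";
}

void padding_dispatch(int rank) {
  switch (rank) {
    case 1: pad_rank<1>(); break;
    case 2: pad_rank<2>(); break;
    case 3: pad_rank<3>(); break;
    case 4: pad_rank<4>(); break;
    case 5: pad_rank<5>(); break;
    case 6: pad_rank<6>(); break;
    default:
      throw std::runtime_error(
          "PadOp only supports tensors with no more than 6 dimensions.");
  }
}

int main() {
  padding_dispatch(4);
  return 0;
}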
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
#include <random> #include <random>
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -31,7 +33,10 @@ namespace math { ...@@ -31,7 +33,10 @@ namespace math {
class Sampler { class Sampler {
public: public:
explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
// PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0."); PADDLE_ENFORCE_GT(range, 0, platform::errors::InvalidArgument(
"Range should be"
" greater than 0, but recevied %d.",
range));
if (seed == 0) { if (seed == 0) {
std::random_device r; std::random_device r;
seed_ = r(); seed_ = r();
......
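In sampler.h the previously commented-out range check becomes a real PADDLE_ENFORCE_GT, so constructing a Sampler with a non-positive range now fails loudly; when no seed is supplied, one is drawn from std::random_device. A trimmed standalone version of just that constructor logic, using an exception in place of the Paddle macro:

#include <random>
#include <stdexcept>

class Sampler {
 public:
  explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) {
    if (range <= 0)
      throw std::invalid_argument("Range should be greater than 0.");
    seed_ = (seed == 0) ? std::random_device{}() : seed;
  }
  int64_t range() const { return range_; }
  unsigned int seed() const { return seed_; }

 private:
  int64_t range_;
  unsigned int seed_;
};

int main() {
  Sampler s(1000);  // valid range, seed picked from random_device
  return 0;
}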
...@@ -29,7 +29,12 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> { ...@@ -29,7 +29,12 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
const framework::SelectedRows& input2, const framework::SelectedRows& input2,
framework::SelectedRows* output) { framework::SelectedRows* output) {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2.height()); PADDLE_ENFORCE_EQ(
in1_height, input2.height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2.height()));
output->set_height(in1_height); output->set_height(in1_height);
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
...@@ -47,15 +52,31 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> { ...@@ -47,15 +52,31 @@ struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
auto& in2_value = input2.value(); auto& in2_value = input2.value();
auto in1_row_numel = in1_value.numel() / in1_rows.size(); auto in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); in1_row_numel, in2_value.numel() / in2_rows.size(),
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, in2_value.numel() / in2_rows.size()));
PADDLE_ENFORCE_EQ(
in1_row_numel, out_value->numel() / out_rows.size(),
platform::errors::InvalidArgument(
"The input and oupput width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, out_value->numel() / out_rows.size()));
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto in2_place = input2.place(); auto in2_place = input2.place();
PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_cpu_place(out_place)); PADDLE_ENFORCE_EQ(platform::is_cpu_place(out_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto* out_data = out_value->data<T>(); auto* out_data = out_value->data<T>();
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
...@@ -82,15 +103,35 @@ struct SelectedRowsAddTensor<platform::CPUDeviceContext, T> { ...@@ -82,15 +103,35 @@ struct SelectedRowsAddTensor<platform::CPUDeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
auto in2_dims = input2.dims(); auto in2_dims = input2.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height, out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But recieved input height = [%d], output height = [%d]",
in1_height, out_dims[0]));
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); in1_row_numel, input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel, output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, output->numel() / in1_height));
SetConstant<platform::CPUDeviceContext, T> functor; SetConstant<platform::CPUDeviceContext, T> functor;
functor(context, output, 0.0); functor(context, output, 0.0);
...@@ -121,7 +162,12 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> { ...@@ -121,7 +162,12 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
const int64_t input2_offset, const int64_t input2_offset,
framework::SelectedRows* input2) { framework::SelectedRows* input2) {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(
in1_height, input2->height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2->height()));
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
...@@ -133,9 +179,13 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> { ...@@ -133,9 +179,13 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
in2_rows.Extend(in1_rows.begin(), in1_rows.end()); in2_rows.Extend(in1_rows.begin(), in1_rows.end());
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto in2_place = input2->place(); auto in2_place = input2->place();
PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the CPU place."));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>(); auto* in2_data = in2_value->data<T>();
...@@ -163,7 +213,12 @@ struct SelectedRowsSumTo<platform::CPUDeviceContext, T> { ...@@ -163,7 +213,12 @@ struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
auto& in_rows = (*iter)->rows(); auto& in_rows = (*iter)->rows();
size += in_rows.end() - in_rows.begin(); size += in_rows.end() - in_rows.begin();
auto in1_height = (*iter)->height(); auto in1_height = (*iter)->height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(in1_height, input2->height(),
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But recieved first input height = [%d], second "
"input height = [%d]",
in1_height, input2->height()));
} }
// concat rows // concat rows
std::vector<int64_t> in2_rows; std::vector<int64_t> in2_rows;
...@@ -201,13 +256,23 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> { ...@@ -201,13 +256,23 @@ struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
} }
auto in1_height = input1.height(); auto in1_height = input1.height();
auto in2_dims = input2->dims(); auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>(); auto* input2_data = input2->data<T>();
...@@ -302,10 +367,12 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -302,10 +367,12 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
continue; continue;
} }
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same " platform::errors::InvalidArgument(
"dimension except for the first one"); "All inputs should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(), PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height"); platform::errors::InvalidArgument(
"All inputs should have same height."));
row_num += input->rows().size(); row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end()); merged_row_set.insert(input->rows().begin(), input->rows().end());
} }
...@@ -421,10 +488,12 @@ struct MergeAverage<platform::CPUDeviceContext, T> { ...@@ -421,10 +488,12 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
continue; continue;
} }
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same " platform::errors::InvalidArgument(
"dimension except for the first one"); "All inputs should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(), PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height"); platform::errors::InvalidArgument(
"All input should have same height."));
row_num += input->rows().size(); row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end()); merged_row_set.insert(input->rows().begin(), input->rows().end());
} }
...@@ -492,13 +561,23 @@ struct UpdateToTensor<platform::CPUDeviceContext, T> { ...@@ -492,13 +561,23 @@ struct UpdateToTensor<platform::CPUDeviceContext, T> {
framework::Tensor* input2) { framework::Tensor* input2) {
auto in1_height = input1.height(); auto in1_height = input1.height();
auto in2_dims = input2->dims(); auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>(); auto* input2_data = input2->data<T>();
......
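The width and height checks threaded through selected_rows_functor.cc all come from the SelectedRows layout: rows() holds destination row indices below height(), and value() stores one dense row of width row_numel for each of those indices. A dense sketch of the scatter-add that SelectedRowsAddToTensor performs, with plain vectors standing in for the tensors:

#include <cstdint>
#include <vector>

// Scatter-add: dense[rows[k]][:] += value[k][:] for every selected row k.
void selected_rows_add_to_tensor(const std::vector<int64_t>& rows,
                                 const std::vector<float>& value,   // rows.size() x width
                                 std::vector<float>* dense,         // height x width
                                 int64_t width) {
  for (size_t k = 0; k < rows.size(); ++k)
    for (int64_t j = 0; j < width; ++j)
      (*dense)[rows[k] * width + j] += value[k * width + j];
}

int main() {
  const int64_t height = 4, width = 2;
  std::vector<int64_t> rows = {0, 3};             // selected row indices < height
  std::vector<float> value = {1, 1, 2, 2};        // 2 x 2 block of updates
  std::vector<float> dense(height * width, 0.f);  // 4 x 2 destination
  selected_rows_add_to_tensor(rows, value, &dense, width);
  return 0;
}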
...@@ -30,7 +30,12 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -30,7 +30,12 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
const framework::SelectedRows& input2, const framework::SelectedRows& input2,
framework::SelectedRows* output) { framework::SelectedRows* output) {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2.height()); PADDLE_ENFORCE_EQ(
in1_height, input2.height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2.height()));
output->set_height(in1_height); output->set_height(in1_height);
framework::Vector<int64_t> in1_rows(input1.rows()); framework::Vector<int64_t> in1_rows(input1.rows());
...@@ -48,18 +53,34 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -48,18 +53,34 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto& in2_value = input2.value(); auto& in2_value = input2.value();
auto in1_row_numel = in1_value.numel() / in1_rows.size(); auto in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); in1_row_numel, in2_value.numel() / in2_rows.size(),
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, in2_value.numel() / in2_rows.size()));
PADDLE_ENFORCE_EQ(
in1_row_numel, out_value->numel() / out_rows.size(),
platform::errors::InvalidArgument(
"The input and oupput width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, out_value->numel() / out_rows.size()));
auto* out_data = out_value->data<T>(); auto* out_data = out_value->data<T>();
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto in2_place = input2.place(); auto in2_place = input2.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true); PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data, memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data,
BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data, BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data,
...@@ -104,15 +125,35 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -104,15 +125,35 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
auto in2_dims = input2.dims(); auto in2_dims = input2.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); in1_height, in2_dims[0],
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But recieved first input height = [%d], first input height = [%d]",
in1_height, in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height, out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But recieved input height = [%d], output height = [%d]",
in1_height, out_dims[0]));
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); in1_row_numel, input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel, output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But recieved input width = [%d], output width = [%d]",
in1_row_numel, output->numel() / in1_height));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = input2.data<T>(); auto* in2_data = input2.data<T>();
...@@ -148,7 +189,12 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -148,7 +189,12 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
const int64_t input2_offset, const int64_t input2_offset,
framework::SelectedRows* input2) { framework::SelectedRows* input2) {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(
in1_height, input2->height(),
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, input2->height()));
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
...@@ -162,9 +208,13 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -162,9 +208,13 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
} }
auto in1_place = input1.place(); auto in1_place = input1.place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto in2_place = input2->place(); auto in2_place = input2->place();
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true,
platform::errors::InvalidArgument(
"The running enviroment is not on the GPU place."));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>(); auto* in2_data = in2_value->data<T>();
...@@ -209,13 +259,23 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> { ...@@ -209,13 +259,23 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
framework::Tensor* input2) { framework::Tensor* input2) {
auto in1_height = input1.height(); auto in1_height = input1.height();
auto in2_dims = input2->dims(); auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>(); auto* in1_data = in1_value.data<T>();
auto* in2_data = input2->data<T>(); auto* in2_data = input2->data<T>();
...@@ -340,10 +400,12 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -340,10 +400,12 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
continue; continue;
} }
PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1],
"all input should have same " platform::errors::InvalidArgument(
"dimension except for the first one"); "All input should have same "
"dimension except for the first one."));
PADDLE_ENFORCE_EQ(input_height, input->height(), PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height"); platform::errors::InvalidArgument(
"All input should have same height."));
merged_row_set.insert(input->rows().begin(), input->rows().end()); merged_row_set.insert(input->rows().begin(), input->rows().end());
} }
std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(), std::vector<int64_t> merge_rows_cpu(merged_row_set.begin(),
...@@ -448,13 +510,23 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> { ...@@ -448,13 +510,23 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
auto in1_height = merged_in1.height(); auto in1_height = merged_in1.height();
auto in2_dims = input2->dims(); auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(
in1_height, in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But recieved first input height = "
"[%d], second input height = [%d]",
in1_height, in2_dims[0]));
auto& in1_value = merged_in1.value(); auto& in1_value = merged_in1.value();
auto& in1_rows = merged_in1.rows(); auto& in1_rows = merged_in1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(
in1_row_numel, input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But recieved first input width = [%d], second input width = [%d]",
in1_row_numel, input2->numel() / in1_height));
auto* in1_data = in1_value.template data<T>(); auto* in1_data = in1_value.template data<T>();
auto* in2_data = input2->data<T>(); auto* in2_data = input2->data<T>();
......
...@@ -38,7 +38,9 @@ TEST(selected_rows_functor, gpu_add) { ...@@ -38,7 +38,9 @@ TEST(selected_rows_functor, gpu_add) {
{static_cast<int64_t>(rows1.size()), row_numel}), {static_cast<int64_t>(rows1.size()), row_numel}),
gpu_place); gpu_place);
functor(ctx, in1_value, 1.0); functor(ctx, in1_value, 1.0);
PADDLE_ENFORCE(cudaDeviceSynchronize()); PADDLE_ENFORCE_EQ(cudaDeviceSynchronize(), 0,
paddle::platform::errors::PreconditionNotMet(
"The all synchronization on the cuda is error!"));
std::vector<int64_t> rows2{0, 5, 7, 9}; std::vector<int64_t> rows2{0, 5, 7, 9};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{ std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
......
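The test now compares the return value of cudaDeviceSynchronize() against 0 (cudaSuccess) and reports a PreconditionNotMet error otherwise. With the raw CUDA runtime API the same check looks roughly like this (sketch assumes a CUDA toolchain and some prior asynchronous work):

#include <cstdio>
#include <cuda_runtime.h>

int main() {
  // Kernel launches or async copies would go here;
  // cudaDeviceSynchronize() surfaces any error they produced.
  cudaError_t err = cudaDeviceSynchronize();
  if (err != cudaSuccess) {
    std::fprintf(stderr, "CUDA synchronization failed: %s\n",
                 cudaGetErrorString(err));
    return 1;
  }
  return 0;
}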
...@@ -34,10 +34,16 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> { ...@@ -34,10 +34,16 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col, const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const { const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4, PADDLE_ENFORCE_EQ(
"The dimension of vol should be 4."); vol.dims().size(), 4,
PADDLE_ENFORCE_EQ(col->dims().size(), 7, platform::errors::InvalidArgument("The dimension of"
"The dimension of col should be 7."); " vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(
col->dims().size(), 7,
platform::errors::InvalidArgument("The dimension of"
"col should be 7, but received %d.",
col->dims().size()));
int input_channels = int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
...@@ -65,27 +71,33 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> { ...@@ -65,27 +71,33 @@ class Vol2ColFunctor<platform::CPUDeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) / ((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] + strides[0] +
1, 1;
output_depth, PADDLE_ENFORCE_EQ(
"input_depth and output_depth are " input_depth_tmp, output_depth,
"mismatching."); platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - "input_depth(%d) and output_depth(%d) are mismatching.",
((dilations[1] * (filter_height - 1) + 1))) / input_depth_tmp, output_depth));
strides[1] + auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
1, ((dilations[1] * (filter_height - 1) + 1))) /
output_height, strides[1] +
"input_height and output_height are " 1;
"mismatching."); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - input_height_tmp, output_height,
((dilations[2] * (filter_width - 1) + 1))) / platform::errors::InvalidArgument(
strides[2] + "input_height(%d) and output_height(%d) are mismatching.",
1, input_height_tmp, output_height));
output_width, auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
"input_width and output_width are " ((dilations[2] * (filter_width - 1) + 1))) /
"mismatching."); strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
const T* vol_data = vol.data<T>(); const T* vol_data = vol.data<T>();
T* col_data = col->data<T>(); T* col_data = col->data<T>();
...@@ -140,10 +152,16 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> { ...@@ -140,10 +152,16 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol, const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const { const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4, PADDLE_ENFORCE_EQ(
"The dimension of vol should be 4."); vol->dims().size(), 4,
PADDLE_ENFORCE_EQ(col.dims().size(), 7, platform::errors::InvalidArgument("The dimension of vol"
"The dimension of col should be 7."); " should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(
col.dims().size(), 7,
platform::errors::InvalidArgument("The dimension of col"
" should be 7, but received %d.",
col.dims().size()));
int input_channels = int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
...@@ -170,27 +188,33 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> { ...@@ -170,27 +188,33 @@ class Col2VolFunctor<platform::CPUDeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) / ((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] + strides[0] +
1, 1;
output_depth, PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
"input_depth and output_depth are " platform::errors::InvalidArgument(
"mismatching."); "input_depth(%d)"
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - " and output_depth(%d) are mismatching.",
((dilations[1] * (filter_height - 1) + 1))) / input_depth_tmp, output_depth));
strides[1] + auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
1, ((dilations[1] * (filter_height - 1) + 1))) /
output_height, strides[1] +
"input_height and output_height are " 1;
"mismatching."); PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - platform::errors::InvalidArgument(
((dilations[2] * (filter_width - 1) + 1))) / "input_height(%d)"
strides[2] + " and output_height(%d) are mismatching.",
1, input_height_tmp, output_height));
output_width, auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
"input_width and output_width are " ((dilations[2] * (filter_width - 1) + 1))) /
"mismatching."); strides[2] +
1;
PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d)"
" and output_width(%d) are mismatching.",
input_width_tmp, output_width));
T* vol_data = vol->data<T>(); T* vol_data = vol->data<T>();
const T* col_data = col.data<T>(); const T* col_data = col.data<T>();
......
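All of the rewritten checks in vol2col.cc enforce the same relation between input size, padding, dilation, stride and the expected output size for each spatial dimension. A minimal standalone sketch of that relation (hypothetical helper, not Paddle code):

#include <cassert>

// output = (input + pad_before + pad_after - (dilation * (kernel - 1) + 1)) / stride + 1
int ConvOutputSize(int input, int kernel, int dilation, int pad_before,
                   int pad_after, int stride) {
  int dilated_kernel = dilation * (kernel - 1) + 1;  // effective kernel extent
  return (input + pad_before + pad_after - dilated_kernel) / stride + 1;
}

int main() {
  // e.g. depth 8, kernel 3, dilation 1, symmetric padding 1, stride 2 -> output depth 4
  assert(ConvOutputSize(8, 3, 1, 1, 1, 2) == 4);
  return 0;
}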
...@@ -90,10 +90,16 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> { ...@@ -90,10 +90,16 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* col, const std::vector<int>& paddings, framework::Tensor* col,
const DataLayout data_layout) const { const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(), 4, PADDLE_ENFORCE_EQ(
"The dimension of vol should be 4."); vol.dims().size(), 4,
PADDLE_ENFORCE_EQ(col->dims().size(), 7, platform::errors::InvalidArgument("The dimension of"
"The dimension of col should be 7."); " vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(
col->dims().size(), 7,
platform::errors::InvalidArgument("The dimension of"
"col should be 7, but received %d.",
col->dims().size()));
int input_channels = int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
...@@ -117,27 +123,33 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> { ...@@ -117,27 +123,33 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) / ((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] + strides[0] +
1, 1;
output_depth, PADDLE_ENFORCE_EQ(
"input_depth and output_depth are " input_depth_tmp, output_depth,
"mismatching."); platform::errors::InvalidArgument(
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - "input_depth(%d) and output_depth(%d) are mismatching.",
((dilations[1] * (filter_height - 1) + 1))) / input_depth_tmp, output_depth));
strides[1] + auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
1, ((dilations[1] * (filter_height - 1) + 1))) /
output_height, strides[1] +
"input_height and output_height are " 1;
"mismatching."); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - input_height_tmp, output_height,
((dilations[2] * (filter_width - 1) + 1))) / platform::errors::InvalidArgument(
strides[2] + "input_height(%d) and output_height(%d) are mismatching.",
1, input_height_tmp, output_height));
output_width, auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
"input_width and output_width are " ((dilations[2] * (filter_width - 1) + 1))) /
"mismatching."); strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp, output_width));
int num_outputs = int num_outputs =
input_channels * output_depth * output_height * output_width; input_channels * output_depth * output_height * output_width;
...@@ -241,10 +253,16 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> { ...@@ -241,10 +253,16 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
const std::vector<int>& strides, const std::vector<int>& strides,
const std::vector<int>& paddings, framework::Tensor* vol, const std::vector<int>& paddings, framework::Tensor* vol,
const DataLayout data_layout) const { const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(), 4, PADDLE_ENFORCE_EQ(
"The dimension of vol should be 4."); vol->dims().size(), 4,
PADDLE_ENFORCE_EQ(col.dims().size(), 7, platform::errors::InvalidArgument("The dimension of vol"
"The dimension of col should be 7."); " should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(
col.dims().size(), 7,
platform::errors::InvalidArgument("The dimension of col"
" should be 7, but received %d.",
col.dims().size()));
int input_channels = int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
...@@ -269,27 +287,33 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> { ...@@ -269,27 +287,33 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) / ((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] + strides[0] +
1, 1;
output_depth, PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
"input_depth and output_depth are " platform::errors::InvalidArgument(
"mismatching."); "input_depth(%d)"
PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - " and output_depth(%d) are mismatching.",
((dilations[1] * (filter_height - 1) + 1))) / input_depth_tmp, output_depth));
strides[1] + auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
1, ((dilations[1] * (filter_height - 1) + 1))) /
output_height, strides[1] +
"input_height and output_height are " 1;
"mismatching."); PADDLE_ENFORCE_EQ(input_height_tmp, output_height,
PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - platform::errors::InvalidArgument(
((dilations[2] * (filter_width - 1) + 1))) / "input_height(%d)"
strides[2] + " and output_height(%d) are mismatching.",
1, input_height_tmp, output_height));
output_width, auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
"input_width and output_width are " ((dilations[2] * (filter_width - 1) + 1))) /
"mismatching."); strides[2] +
1;
PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
platform::errors::InvalidArgument(
"input_width(%d)"
" and output_width(%d) are mismatching.",
input_width_tmp, output_width));
int num_kernels = input_channels * input_depth * input_height * input_width; int num_kernels = input_channels * input_depth * input_height * input_width;
......
...@@ -30,8 +30,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> { ...@@ -30,8 +30,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel<T> {
float std = context.Attr<float>("std"); float std = context.Attr<float>("std");
auto* tensor = context.Output<framework::Tensor>("Out"); auto* tensor = context.Output<framework::Tensor>("Out");
const std::string op_type = "gaussian_random"; auto shape = GetShape(context);
auto shape = GetShape(context, op_type);
tensor->Resize(shape); tensor->Resize(shape);
T* data = tensor->mutable_data<T>(context.GetPlace()); T* data = tensor->mutable_data<T>(context.GetPlace());
int64_t size = tensor->numel(); int64_t size = tensor->numel();
......
...@@ -48,6 +48,9 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -48,6 +48,9 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("lars_weight_decay", AddAttr<float>("lars_weight_decay",
"(float, default 0.0005) LARS weight decay") "(float, default 0.0005) LARS weight decay")
.SetDefault(0.0005); .SetDefault(0.0005);
AddAttr<float>("epsilon",
"(float, default 0.0) epsilon to avoid Division by Zero.")
.SetDefault(0.0);
AddComment(R"DOC( AddComment(R"DOC(
Lars Momentum Optimizer. Lars Momentum Optimizer.
......
...@@ -23,14 +23,16 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v, ...@@ -23,14 +23,16 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu, const T* learning_rate, const T mu,
const int64_t num, const T lars_coeff, const int64_t num, const T lars_coeff,
const T lars_weight_decay, const T* p_norm, const T lars_weight_decay, const T* p_norm,
const T* g_norm, T* p_out, T* v_out) { const T* g_norm, T* p_out, T* v_out,
const T epsilon) {
T lr = learning_rate[0]; T lr = learning_rate[0];
T local_lr = learning_rate[0]; T local_lr = learning_rate[0];
CUDA_KERNEL_LOOP(i, num) { CUDA_KERNEL_LOOP(i, num) {
if (p_norm[0] > 0 && g_norm[0] > 0) { if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) {
local_lr = lr * lars_coeff * p_norm[0] / local_lr = lr * lars_coeff * p_norm[0] /
(g_norm[0] + lars_weight_decay * p_norm[0]); (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon);
} }
T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]); T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
v_out[i] = v_new; v_out[i] = v_new;
p_out[i] = p[i] - v_new; p_out[i] = p[i] - v_new;
...@@ -54,6 +56,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> { ...@@ -54,6 +56,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
T mu = static_cast<T>(ctx.Attr<float>("mu")); T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff"); T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay"); T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
T epsilon = ctx.Attr<float>("epsilon");
auto* p = param->data<T>(); auto* p = param->data<T>();
auto* v = velocity->data<T>(); auto* v = velocity->data<T>();
...@@ -79,7 +82,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> { ...@@ -79,7 +82,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
eg_norm.device(*place) = eigen_g.square().sum().sqrt(); eg_norm.device(*place) = eigen_g.square().sum().sqrt();
MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>( MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
p_norm_data, g_norm_data, p_out, v_out); p_norm_data, g_norm_data, p_out, v_out, epsilon);
} }
}; };
......
...@@ -39,6 +39,7 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> { ...@@ -39,6 +39,7 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
T mu = static_cast<T>(ctx.Attr<float>("mu")); T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff"); T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay"); T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
T epsilon = ctx.Attr<float>("epsilon");
auto p_out = framework::EigenVector<T>::Flatten(*param_out); auto p_out = framework::EigenVector<T>::Flatten(*param_out);
auto v_out = framework::EigenVector<T>::Flatten(*velocity_out); auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
...@@ -59,9 +60,9 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> { ...@@ -59,9 +60,9 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
ep_norm = p.square().sum().sqrt(); ep_norm = p.square().sum().sqrt();
eg_norm = g.square().sum().sqrt(); eg_norm = g.square().sum().sqrt();
T local_lr = lr[0]; T local_lr = lr[0];
if (ep_norm(0) > 0 && eg_norm(0) > 0) { if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) {
local_lr = lr[0] * lars_coeff * ep_norm(0) / local_lr = lr[0] * lars_coeff * ep_norm(0) /
(eg_norm(0) + lars_weight_decay * ep_norm(0)); (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon);
} }
v_out = v * mu + local_lr * (g + lars_weight_decay * p); v_out = v * mu + local_lr * (g + lars_weight_decay * p);
p_out = p - v_out; p_out = p - v_out;
......
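Both the CPU and CUDA LARS kernels above now skip the rescaling when lars_weight_decay is 0 and add the new epsilon attribute to the denominator to avoid division by zero. A standalone sketch of the local learning-rate rule (assumed names, not Paddle code):

#include <cstdio>

// local_lr = lr * lars_coeff * ||p|| / (||g|| + weight_decay * ||p|| + epsilon)
float LocalLearningRate(float lr, float lars_coeff, float weight_decay,
                        float p_norm, float g_norm, float epsilon) {
  if (weight_decay > 0.f && p_norm > 0.f && g_norm > 0.f) {
    return lr * lars_coeff * p_norm /
           (g_norm + weight_decay * p_norm + epsilon);
  }
  return lr;  // fall back to the global learning rate
}

int main() {
  std::printf("%f\n", LocalLearningRate(0.1f, 0.001f, 0.0005f, 10.f, 2.f, 0.f));
  return 0;
}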
include(operators) include(operators)
if(WITH_GPU) if(WITH_GPU)
register_operators(DEPS cub) if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
register_operators(DEPS cub)
else()
register_operators()
endif()
else() else()
register_operators() register_operators()
endif() endif()
...@@ -24,5 +28,9 @@ if(WITH_GPU) ...@@ -24,5 +28,9 @@ if(WITH_GPU)
endif() endif()
if(WITH_GPU) if(WITH_GPU)
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub)
else()
nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor)
endif()
endif() endif()
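The CUDA-version branches above presumably reflect that CUB ships as part of the CUDA Toolkit from version 11 onward, so the separate cub dependency is only needed when building against older toolkits.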
...@@ -13,18 +13,138 @@ ...@@ -13,18 +13,138 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
#include <memory> #include <algorithm>
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
class LogsumexpOpMaker : public ops::ReduceOpMaker { class LogsumexpOp : public framework::OperatorWithKernel {
protected: public:
virtual std::string GetName() const { return "logsumexp"; } using framework::OperatorWithKernel::OperatorWithKernel;
virtual std::string GetOpType() const { return "Reduce logsumexp"; }
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "logsumexp");
auto x_dims = ctx->GetInputDim("X");
auto x_rank = x_dims.size();
PADDLE_ENFORCE_LE(x_rank, 4,
platform::errors::InvalidArgument(
"The input tensor X's dimensions of logsumexp "
"should be less equal than 4. But received X's "
"dimensions = %d, X's shape = [%s].",
x_rank, x_dims));
auto axis = ctx->Attrs().Get<std::vector<int>>("axis");
PADDLE_ENFORCE_GT(
axis.size(), 0,
platform::errors::InvalidArgument(
"The size of axis of logsumexp "
"should be greater than 0. But received the size of axis "
"of logsumexp is %d.",
axis.size()));
for (size_t i = 0; i < axis.size(); i++) {
PADDLE_ENFORCE_LT(
axis[i], x_rank,
platform::errors::InvalidArgument(
"axis[%d] should be in the "
"range [-dimension(X), dimension(X)] "
"where dimesion(X) is %d. But received axis[i] = %d.",
i, x_rank, axis[i]));
PADDLE_ENFORCE_GE(
axis[i], -x_rank,
platform::errors::InvalidArgument(
"axis[%d] should be in the "
"range [-dimension(X), dimension(X)] "
"where dimesion(X) is %d. But received axis[i] = %d.",
i, x_rank, axis[i]));
if (axis[i] < 0) {
axis[i] += x_rank;
}
}
bool keepdim = ctx->Attrs().Get<bool>("keepdim");
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
auto dims_vector = vectorize(x_dims);
if (reduce_all) {
if (keepdim)
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
else
ctx->SetOutputDim("Out", {1});
} else {
if (keepdim) {
for (size_t i = 0; i < axis.size(); ++i) {
dims_vector[axis[i]] = 1;
}
} else {
const int kDelFlag = -1;
for (size_t i = 0; i < axis.size(); ++i) {
dims_vector[axis[i]] = kDelFlag;
}
dims_vector.erase(
std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
dims_vector.end());
}
if (!keepdim && dims_vector.size() == 0) {
dims_vector.push_back(1);
}
auto out_dims = framework::make_ddim(dims_vector);
ctx->SetOutputDim("Out", out_dims);
if (axis.size() > 0 && axis[0] != 0) {
// Only pass LoD when not reducing on the first dim.
ctx->ShareLoD("X", /*->*/ "Out");
}
}
}
};
class LogsumexpOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor) The input tensor. Tensors with rank at most 4 are "
"supported.");
AddOutput("Out", "(Tensor) The result tensor.");
AddAttr<std::vector<int>>(
"axis",
"(list<int>, default {0}) The dimensions to reduce. "
"Must be in the range [-rank(input), rank(input)). "
"If `axis[i] < 0`, the axis[i] to reduce is `rank + axis[i]`. "
"Note that reducing on the first dim will make the LoD info lost.")
.SetDefault({0});
AddAttr<bool>("keepdim",
"(bool, default false) "
"If true, retain the reduced dimension with length 1.")
.SetDefault(false);
AddAttr<bool>("reduce_all",
"(bool, default false) "
"If true, output a scalar reduced along all dimensions.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
logsumexp Operator.
This operator computes the logsumexp of the input tensor along the given axis.
The result tensor has 1 fewer dimension than the input unless keepdim is true.
If reduce_all is true, just reduce along all dimensions and output a scalar.
)DOC"));
}
};
class LogsumexpGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp");
OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "logsumexp");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
"Out@GRAD", "logsumexp");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
}; };
template <typename T> template <typename T>
...@@ -32,7 +152,6 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -32,7 +152,6 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
public: public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker; using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override { void Apply(GradOpPtr<T> op) const override {
op->SetType("logsumexp_grad"); op->SetType("logsumexp_grad");
op->SetInput("X", this->Input("X")); op->SetInput("X", this->Input("X"));
...@@ -46,18 +165,17 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -46,18 +165,17 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OPERATOR(logsumexp, ops::ReduceOp, ops::LogsumexpOpMaker, namespace ops = paddle::operators;
REGISTER_OPERATOR(logsumexp, ops::LogsumexpOp, ops::LogsumexpOpMaker,
ops::LogsumexpGradOpMaker<paddle::framework::OpDesc>, ops::LogsumexpGradOpMaker<paddle::framework::OpDesc>,
ops::LogsumexpGradOpMaker<paddle::imperative::OpBase>); ops::LogsumexpGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(logsumexp_grad, ops::ReduceGradOp); REGISTER_OPERATOR(logsumexp_grad, ops::LogsumexpGradOp);
REGISTER_OP_CPU_KERNEL(logsumexp,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
float, ops::LogsumexpFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
double, ops::LogsumexpFunctor>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, logsumexp, ops::LogsumexpKernel<paddle::platform::CPUDeviceContext, float>,
float, ops::LogsumexpGradFunctor>, ops::LogsumexpKernel<paddle::platform::CPUDeviceContext, double>);
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double, REGISTER_OP_CPU_KERNEL(
ops::LogsumexpGradFunctor>); logsumexp_grad,
ops::LogsumexpGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::LogsumexpGradKernel<paddle::platform::CPUDeviceContext, double>);
...@@ -14,8 +14,8 @@ ...@@ -14,8 +14,8 @@
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
REGISTER_OP_CUDA_KERNEL(logsumexp, namespace ops = paddle::operators;
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
float, ops::LogsumexpFunctor>, REGISTER_OP_CUDA_KERNEL(
ops::ReduceKernel<paddle::platform::CUDADeviceContext, logsumexp, ops::LogsumexpKernel<paddle::platform::CUDADeviceContext, float>,
double, ops::LogsumexpFunctor>); ops::LogsumexpKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -15,8 +15,9 @@ ...@@ -15,8 +15,9 @@
// .part used to speed up nvcc compile // .part used to speed up nvcc compile
#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
logsumexp_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, logsumexp_grad,
float, ops::LogsumexpGradFunctor>, ops::LogsumexpGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double, ops::LogsumexpGradKernel<paddle::platform::CUDADeviceContext, double>);
ops::LogsumexpGradFunctor>);
...@@ -209,9 +209,14 @@ class RunProgramOpKernel : public framework::OpKernel<T> { ...@@ -209,9 +209,14 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
auto output_vars = ctx.MultiOutputVar("Out"); auto output_vars = ctx.MultiOutputVar("Out");
auto input_var_names = ctx.InputNames("X"); auto input_var_names = ctx.InputNames("X");
auto param_names = ctx.InputNames("Params");
auto output_var_names = ctx.OutputNames("Out"); auto output_var_names = ctx.OutputNames("Out");
// current program may not hold parameters
std::vector<std::string> param_names;
if (!param_vars.empty()) {
param_names = ctx.InputNames("Params");
}
auto *block = ctx.Attr<BlockDesc *>("global_block"); auto *block = ctx.Attr<BlockDesc *>("global_block");
auto *program = block->Program(); auto *program = block->Program();
auto start_op_index = ctx.Attr<int64_t>("start_op_index"); auto start_op_index = ctx.Attr<int64_t>("start_op_index");
......