Commit bc8a8042 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into all_new_design_exec

@@ -69,15 +69,21 @@ if(NOT DEFINED CBLAS_PROVIDER)
     PATHS ${OPENBLAS_LIB_SEARCH_PATHS})
   if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB)
-    set(CBLAS_PROVIDER OPENBLAS)
-    set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
-    set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
-    add_definitions(-DPADDLE_USE_OPENBLAS)
-    add_definitions(-DLAPACK_FOUND)
-    message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-    message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
+    file(READ "${OPENBLAS_INC_DIR}/openblas_config.h" config_file)
+    string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file})
+    string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp})
+    if (${ver} VERSION_EQUAL "0.3.7")
+      set(CBLAS_PROVIDER OPENBLAS)
+      set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
+      set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
+      add_definitions(-DPADDLE_USE_OPENBLAS)
+      add_definitions(-DLAPACK_FOUND)
+      message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+      message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
+    endif()
   endif()
 endif()
...
@@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn)
 SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
 SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git)
-SET(MKLDNN_TAG f3999b71d8e4415c1985a0dfb812a3ed77ee21fa)
+SET(MKLDNN_TAG 748528a2d3204b5f401c14a9aacdec16accd5ead)
 # Introduce variables:
...
@@ -7,52 +7,70 @@ SET(XPU_PROJECT "extern_xpu")
 SET(XPU_API_LIB_NAME "libxpuapi.so")
 SET(XPU_RT_LIB_NAME "libxpurt.so")
-if(NOT XPU_SDK_ROOT)
-  if (WITH_AARCH64)
-    SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
-  elseif(WITH_SUNWAY)
-    SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE)
-  else()
-    SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_05_19.tar.gz" CACHE STRING "" FORCE)
-  endif()
-
-  SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
-  SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
-  SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
-  SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
-  SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
-
-  SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
-  SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
-
-  SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
-
-  FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
-    "PROJECT(XPU)\n"
-    "cmake_minimum_required(VERSION 3.0)\n"
-    "install(DIRECTORY xpu/include xpu/lib \n"
-    "        DESTINATION ${XPU_INSTALL_DIR})\n")
-
-  ExternalProject_Add(
-      ${XPU_PROJECT}
-      ${EXTERNAL_PROJECT_LOG_ARGS}
-      PREFIX            ${XPU_SOURCE_DIR}
-      DOWNLOAD_DIR      ${XPU_DOWNLOAD_DIR}
-      DOWNLOAD_COMMAND  wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
-                        && tar xvf xpu.tar.gz
-      DOWNLOAD_NO_PROGRESS  1
-      UPDATE_COMMAND    ""
-      CMAKE_ARGS        -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
-      CMAKE_CACHE_ARGS  -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
-  )
-else()
-  SET(XPU_API_INC_DIR "${XPU_SDK_ROOT}/XTDK/include/")
-  SET(XPU_API_LIB "${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so")
-  SET(XPU_RT_LIB "${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so")
-  SET(XPU_LIB_DIR "${XPU_SDK_ROOT}/XTDK/shlib/")
-endif()
+IF(WITH_AARCH64)
+  SET(XPU_XRE_DIR_NAME "xre-kylin_aarch64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64")
+  SET(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64")
+ELSEIF(WITH_SUNWAY)
+  SET(XPU_XRE_DIR_NAME "xre-deepin_sw6_64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64")
+  SET(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64")
+ELSEIF(WITH_BDCENTOS)
+  SET(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64")
+  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+ELSEIF(WITH_UBUNTU)
+  SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
+  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+ELSEIF(WITH_CENTOS)
+  SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64")
+  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+ELSE ()
+  SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
+  SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
+  SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
+ENDIF()
+
+SET(XPU_BASE_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20210527")
+SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_XCCL_URL "${XPU_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE)
+
+SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
+SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
+SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
+SET(XPU_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
+SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
+
+SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
+SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
+
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
+
+FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(XPU)\n"
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "install(DIRECTORY xpu/include xpu/lib \n"
+  "        DESTINATION ${XPU_INSTALL_DIR})\n")
+
+ExternalProject_Add(
+    ${XPU_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX            ${XPU_SOURCE_DIR}
+    DOWNLOAD_DIR      ${XPU_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND  wget ${XPU_PACK_DEPENCE_URL}
+                      && bash pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME}
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND    ""
+    CMAKE_ARGS        -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS  -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
+)
-INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
+INCLUDE_DIRECTORIES(${XPU_INC_DIR})
 ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
 set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
@@ -62,7 +80,7 @@ generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
 TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
-if (WITH_XPU_BKCL)
+IF(WITH_XPU_BKCL)
   MESSAGE(STATUS "Compile with XPU BKCL!")
   ADD_DEFINITIONS(-DPADDLE_WITH_XPU_BKCL)
@@ -71,9 +89,9 @@ if (WITH_XPU_BKCL)
   SET(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
   INCLUDE_DIRECTORIES(${XPU_BKCL_INC_DIR})
   TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB})
-else(WITH_XPU_BKCL)
-  TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} )
-endif(WITH_XPU_BKCL)
+ELSE(WITH_XPU_BKCL)
+  TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
+ENDIF(WITH_XPU_BKCL)

 if(NOT XPU_SDK_ROOT)
   ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
...
@@ -215,6 +215,8 @@ list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boos
 list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)

 include(cblas)      # find first, then download, build, install openblas
+
+message(STATUS "CBLAS_PROVIDER: ${CBLAS_PROVIDER}")
 if(${CBLAS_PROVIDER} STREQUAL MKLML)
   list(APPEND third_party_deps extern_mklml)
 elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS)
...
@@ -19,6 +19,7 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/place.h"

 namespace paddle {
@@ -30,9 +31,28 @@ void CheckVarHasNanOrInf(const std::string& op_type,
                          const std::string& var_name,
                          const platform::Place& place);

+void CheckVarHasNanOrInf(const std::string& op_type,
+                         const std::string& var_name,
+                         const framework::Variable* var,
+                         const platform::Place& place);
+
 void CheckOpHasNanOrInf(const framework::OperatorBase& op,
                         const framework::Scope& scope,
                         const platform::Place& place);

+template <typename VarType>
+void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
+                                 const imperative::NameVarMap<VarType>& op_outs,
+                                 platform::Place place) {
+  for (const auto& pair : op_outs) {
+    for (const auto& ivar : pair.second) {
+      auto* var = ivar->MutableVar();
+      if (var == nullptr) continue;
+      CheckVarHasNanOrInf(op_type, ivar->Name(), var, place);
+    }
+  }
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
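Note that the dygraph helper above only walks the op's output map and forwards each live variable to the pointer-based CheckVarHasNanOrInf overload; the element-wise scan happens later in tensor_check. A minimal standalone sketch of what that scan ultimately tests, in plain C++ (illustrative only, not a Paddle API):

#include <cmath>
#include <cstddef>

// True if any element is NaN or Inf -- the condition that makes the
// checker abort the run and report the offending op/variable.
bool HasNanOrInf(const float* data, size_t n) {
  for (size_t i = 0; i < n; ++i) {
    if (std::isnan(data[i]) || std::isinf(data[i])) return true;
  }
  return false;
}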
@@ -297,13 +297,12 @@ void tensor_check<platform::CPUDeviceContext>(const std::string& op_type,
 }

 void CheckVarHasNanOrInf(const std::string& op_type,
-                         const framework::Scope& scope,
                          const std::string& var_name,
+                         const framework::Variable* var,
                          const platform::Place& place) {
-  auto* var = scope.FindVar(var_name);
   PADDLE_ENFORCE_NOT_NULL(
-      var, platform::errors::NotFound("In op=%s, can't find var:%s", op_type,
-                                      var_name));
+      var, platform::errors::NotFound("Cannot find var: `%s` in op `%s`.",
+                                      var_name, op_type));

   const Tensor* tensor{nullptr};
   if (var->IsType<framework::LoDTensor>()) {
@@ -393,6 +392,14 @@ void CheckVarHasNanOrInf(const std::string& op_type,
   tensor_check<platform::CPUDeviceContext>(op_type, var_name, *tensor, place);
 }

+void CheckVarHasNanOrInf(const std::string& op_type,
+                         const framework::Scope& scope,
+                         const std::string& var_name,
+                         const platform::Place& place) {
+  auto* var = scope.FindVar(var_name);
+  CheckVarHasNanOrInf(op_type, var_name, var, place);
+}
+
 bool IsSkipOp(const framework::OperatorBase& op) {
   if (op_type_nan_inf_white_list().count(op.Type()) != 0) return true;
...
@@ -176,6 +176,7 @@ message DistributedStrategy {
   optional bool find_unused_parameters = 28 [ default = false ];
   optional bool tensor_parallel = 29 [ default = false ];
   optional bool without_graph_optimization = 30 [ default = false ];
+  optional int32 fuse_grad_size_in_num = 31 [ default = 1 ];

   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
...
@@ -14,7 +14,6 @@
 #include "paddle/fluid/framework/ir/fc_fuse_pass.h"

 #include <string>

-#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"

@@ -38,6 +37,7 @@ FCFusePass::FCFusePass() {
       .IsNumGE(1)
       .End()
       .AddAttr("y_num_col_dims")
+      .IsNumEQ(1)
       .End();

   AddOpCompat(OpCompat("elementwise_add"))
@@ -51,6 +51,7 @@ FCFusePass::FCFusePass() {
       .IsTensor()
       .End()
       .AddAttr("axis")
+      .IsNumGE(1)
       .End();

   AddOpCompat(OpCompat("relu"))
...
@@ -58,12 +58,12 @@ TEST(FCFusePass, basic) {
   auto* weights_0 = layers.data("weights_0", {}, true);
   auto* mul_out_0 = layers.mul(relu_out_0, weights_0);
   auto* bias_1 = layers.data("bias_1", {}, true);
-  auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1);
+  auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1, nullptr, 1);
   auto* relu_out_1 = layers.relu(add_out_0);
   auto* weights_1 = layers.data("weights_1", {}, true);
   auto* mul_out_1 = layers.mul(relu_out_1, weights_1);
   auto* bias_2 = layers.data("bias_2", {}, true);
-  auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2);
+  auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2, nullptr, 1);
   VLOG(4) << add_out_1;

   std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
...
@@ -250,6 +250,32 @@ OpCompat& OpCompatSensiblePass::AddOpCompat(OpCompat&& op_compat) {
   return *(op_compat_judgers_[name]);
 }

+//! Tell the Op compatibility of a subgraph.
+bool OpCompatSensiblePass::IsCompat(
+    const GraphPatternDetector::subgraph_t& subgraph, Graph*) const {
+  PADDLE_ENFORCE_EQ(op_compat_judgers_.empty(), false,
+                    platform::errors::InvalidArgument(
+                        "At least one OpCompat instance should be added"));
+  // Check that all the ops in the subgraph are contained in the
+  // op_compat.
+  for (auto& node_pair : subgraph) {
+    if (!node_pair.second->IsOp()) continue;
+    auto op_type = node_pair.second->Op()->Type();
+    if (!op_compat_judgers_.count(op_type)) {
+      if (HasOpDef(op_type)) {
+        LOG(WARNING) << op_type << " compat not registered!";
+        return false;
+      }
+      continue;
+    }
+    auto& judger = *op_compat_judgers_.at(op_type);
+    if (!judger.Judge(*(node_pair.second->Op()))) {
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
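For context, a hedged sketch of how a fuse pass typically consumes this method from its pattern-detector handler (the call-site shape below is illustrative; see fc_fuse_pass.cc in this repo for a real one):

// Inside a pass's ApplyImpl: bail out of a matched subgraph early when
// any op in it fails the registered OpCompat constraints.
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                   Graph* g) {
  if (!IsCompat(subgraph, g)) {
    LOG(WARNING) << "op compat check failed, skip this fusion.";
    return;  // leave the graph unchanged for this match
  }
  // ... rewrite the matched subgraph into the fused op ...
};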
@@ -195,26 +195,7 @@ class OpCompatSensiblePass : public Pass {
   //! Tell the Op compatibility of a subgraph.
   bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph,
-                Graph* g) const {
-    CHECK(!op_compat_judgers_.empty())
-        << "At least one OpCompat instance should be added in the "
-           "OpCompatSensiblePass.";
-    // Check the all the ops in the subgraph are contained in the
-    // op_compat.
-    for (auto& node_pair : subgraph) {
-      if (!node_pair.second->IsOp()) continue;
-      auto op_type = node_pair.second->Op()->Type();
-      if (!op_compat_judgers_.count(op_type)) {
-        LOG(WARNING) << op_type << "compat not registered!";
-        return false;
-      }
-      auto& judger = *op_compat_judgers_.at(op_type);
-      if (!judger.Judge(*(node_pair.second->Op()))) {
-        return false;
-      }
-    }
-    return true;
-  }
+                Graph* g) const;

   //! Tell the op compatibility of a single Op.
   bool IsCompat(const OpDesc& op_desc) const {
...
@@ -151,6 +151,10 @@ class OpCompatSensiblePassTest : public OpCompatSensiblePass {
  public:
   OpCompatSensiblePassTest();
   bool TestIsCompat(const OpDesc& op_desc) { return IsCompat(op_desc); }
+  bool TestIsCompat(const GraphPatternDetector::subgraph_t& subgraph,
+                    Graph* g) {
+    return IsCompat(subgraph, g);
+  }
 };

 OpCompatSensiblePassTest::OpCompatSensiblePassTest() {
@@ -192,6 +196,23 @@ TEST(OpCompatSensiblePass, IsCompat) {
   EXPECT_TRUE(test.TestIsCompat(fc_op));
 }

+TEST(OpCompatSensiblePass, IsCompatFail) {
+  OpCompatSensiblePassTest test;
+  GraphPatternDetector::subgraph_t subgraph;
+  PDPattern pattern;
+  PDNode* pd_node = pattern.NewNode();
+  ProgramDesc prog;
+  Graph g(prog);
+  OpDesc fc_op;
+  fc_op.SetType("op1");
+  subgraph[pd_node] = g.CreateOpNode(&fc_op);
+  EXPECT_TRUE(test.TestIsCompat(subgraph, &g));
+  fc_op.SetType("mul");
+  subgraph[pd_node] = g.CreateOpNode(&fc_op);
+  EXPECT_FALSE(test.TestIsCompat(subgraph, &g));
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
@@ -194,14 +194,18 @@ struct Layers {
   }

   VarDesc* mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr,
-               int x_num_col_dims = 1) {
+               int x_num_col_dims = 1, int y_num_col_dims = 1) {
     AttributeMap attrs;
-    attrs["x_num_col_dims"] = 1;
+    attrs["x_num_col_dims"] = x_num_col_dims;
+    attrs["y_num_col_dims"] = y_num_col_dims;
     return binary_op("mul", x, y, out, &attrs);
   }

-  VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr) {
-    return binary_op("elementwise_add", x, y, out);
+  VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr,
+                           int axis = -1) {
+    AttributeMap attrs;
+    attrs["axis"] = axis;
+    return binary_op("elementwise_add", x, y, out, &attrs);
   }

   VarDesc* elementwise_mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr,
...
@@ -66,9 +66,13 @@ static bool IsFCWithPaddingWeights(Node* n) {
 }

 static bool IsParamOfFC(Node* n, const std::string& param_name) {
-  if (IsInputOfFC(n) && n->inputs.empty() &&
-      (n->Name() == n->outputs[0]->Op()->Input(param_name)[0])) {
-    return true;
+  if (IsInputOfFC(n) && n->inputs.empty()) {
+    for (auto* out : n->outputs) {
+      if (out->Op()->Type() == "fc" &&
+          n->Name() == out->Op()->Input(param_name)[0]) {
+        return true;
+      }
+    }
   }
   return false;
 }
...
@@ -276,7 +276,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor) {
   SerializeToStream(os, tensor, *dev_ctx);
 }

-void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) {
+void DeserializeFromStream(std::istream &os, LoDTensor *tensor) {
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   const platform::DeviceContext *dev_ctx;
   dev_ctx = pool.Get(platform::CPUPlace());
...
@@ -257,7 +257,7 @@ LoD ConvertToOffsetBasedLoD(const LoD& length_lod);

 void SerializeToStream(std::ostream& os, const LoDTensor& tensor);

-void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor);
+void DeserializeFromStream(std::istream& os, LoDTensor* tensor);

 }  // namespace framework
 }  // namespace paddle
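Relaxing the parameter from std::ifstream to std::istream (here and in the SelectedRows overloads below) means deserialization no longer requires a file on disk; any input stream works. A sketch of the in-memory round trip this enables, assuming a CPU LoDTensor inside a Paddle build:

#include <sstream>
#include "paddle/fluid/framework/lod_tensor.h"

void RoundTrip(const paddle::framework::LoDTensor& src,
               paddle::framework::LoDTensor* dst) {
  std::stringstream buffer;  // in-memory stream; no file needed
  paddle::framework::SerializeToStream(buffer, src);
  paddle::framework::DeserializeFromStream(buffer, dst);  // now accepted
}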
@@ -68,5 +68,9 @@ const proto::OpDef& GetOpDef(const std::string& op_name) {
   }
   return ops_definition.at(op_name);
 }

+bool HasOpDef(const std::string& op_name) {
+  return op_def_map.find(op_name) != op_def_map.end();
+}
+
 }  // namespace framework
 }  // namespace paddle
@@ -19,5 +19,7 @@

 namespace paddle {
 namespace framework {
 const proto::OpDef& GetOpDef(const std::string& op_name);
+
+bool HasOpDef(const std::string& op_name);
 }
 }
@@ -121,7 +121,7 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) {
   SerializeToStream(os, selected_rows, *dev_ctx);
 }

-void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) {
+void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   const platform::DeviceContext* dev_ctx;
   dev_ctx = pool.Get(platform::CPUPlace());
...
@@ -175,7 +175,7 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,

 void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows);

-void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows);
+void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows);

 }  // namespace framework
 }  // namespace paddle
 cc_library(imperative_flag SRCS flags.cc DEPS gflags)
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils)
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
 add_subdirectory(jit)
 cc_library(amp SRCS amp_auto_cast.cc DEPS layer )
...
@@ -15,8 +15,11 @@
 #include "paddle/fluid/imperative/prepared_operator.h"

 #include "paddle/fluid/framework/data_type_transform.h"
+#include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"

+DECLARE_bool(check_nan_inf);
+
 namespace paddle {
 namespace imperative {

@@ -175,6 +178,11 @@ static void PreparedOpRunImpl(
   func(DygraphExecutionContext<VarType>(op, scope, *dev_ctx, ctx, ins, outs,
                                         attrs));

+  if (FLAGS_check_nan_inf) {
+    framework::details::CheckOpHasNanOrInfInDygraph<VarType>(
+        op.Type(), outs, dev_ctx->GetPlace());
+  }
+
   /**
    * [ Why need handle complex gradient to real gradient? ]
    *
...
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/imperative/layer.h"
+#include "paddle/fluid/imperative/prepared_operator.h"
 #include "paddle/fluid/imperative/tracer.h"

 #include "paddle/fluid/framework/op_registry.h"
@@ -32,7 +33,17 @@ bool RequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
   for (const auto& name_pair : ins) {
     for (const auto& var_base : name_pair.second) {
       if (!var_base->OverridedStopGradient()) {
-        PassStopGradient(outs, var_base->OverridedStopGradient());
+        for (const auto& pair : outs) {
+          for (const auto& var : pair.second) {
+            if (var) {
+              var->SetOverridedStopGradient(false);
+              SetForwardDataTypeOfGradVar(var);
+              VLOG(3) << "Set output: " << var->Name()
+                      << "'s OverridedStopGradient as "
+                      << var->OverridedStopGradient();
+            }
+          }
+        }
         return true;
       }
     }
   }
@@ -78,28 +89,36 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls,
   // process args, `input_vars` only collects `imperative::VarBase`
   if (!args.empty()) {
     for (auto ptr = args.begin(); ptr != args.end(); ptr++) {
-      try {
-        if (Py_None != ptr->ptr()) {
+      // Only collect Tensor type in 'args' and pass them to backward. Ignore
+      // other types of input temporarily.
+      if (py::isinstance<imperative::VarBase>(*ptr)) {
+        try {
           auto a = ptr->cast<std::shared_ptr<VarBase>>();
           input_vars.push_back(a);
+        } catch (py::cast_error& err) {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "The `PyLayer.forward` function contains invalid argument, the "
+              "`%s` type argument can not be cast into `Tensor`.",
+              ptr->ptr()->ob_type->tp_name));
         }
-      } catch (py::cast_error& err) {
-        // Only collect Tensor type in 'args' and pass them to backward. Ignore
-        // other types of input temporarily.
       }
     }
   }

   // process kwargs, only collect `imperative::VarBase`
   if (!kwargs.empty()) {
     for (auto ptr = kwargs.begin(); ptr != kwargs.end(); ptr++) {
-      try {
-        if (Py_None != ptr->second.ptr()) {
+      // Only collect Tensor type in 'kwargs' and pass them to backward.
+      // Ignore other types of input temporarily.
+      if (py::isinstance<imperative::VarBase>(*ptr->second)) {
+        try {
           auto a = ptr->second.cast<std::shared_ptr<VarBase>>();
           input_vars.push_back(a);
+        } catch (py::cast_error&) {
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "The `PyLayer.forward` function contains invalid argument, the "
+              "`%s` type argument can not be cast into `Tensor`.",
+              ptr->second.ptr()->ob_type->tp_name));
         }
-      } catch (py::cast_error&) {
-        // Only collect Tensor type in 'kwargs' and pass them to backward.
-        // Ignore other types of input temporarily.
       }
     }
   }
@@ -110,33 +129,35 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls,
       PyList_Check(result_forward.ptr())) {
     auto tuple_result = result_forward.cast<py::tuple>();
     for (size_t i = 0; i < tuple_result.size(); i++) {
-      if (Py_None != tuple_result[i].ptr()) {
+      // Only collect Tensor type of output and pass them to backward.
+      // Ignore other types of input temporarily.
+      if (py::isinstance<imperative::VarBase>(tuple_result[i])) {
         try {
           auto temp_out =
               tuple_result[i].cast<std::shared_ptr<imperative::VarBase>>();
           output_vars.push_back(temp_out);
         } catch (py::cast_error&) {
-          // Only collect Tensor type in 'kwargs' and pass them to backward.
-          // Ignore other types of input temporarily.
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "The `PyLayer.forward` function returns invalid argument, the "
+              "`%s` type argument can not be cast into `Tensor`.",
+              tuple_result[i].ptr()->ob_type->tp_name));
         }
-      } else {
-        // Only collect Tensor type in 'kwargs' and pass them to backward.
-        // Ignore other types of input temporarily.
       }
     }
   } else {
-    if (Py_None != result_forward.ptr()) {
+    // Only collect Tensor type of output and pass them to backward.
+    // Ignore other types of input temporarily.
+    if (py::isinstance<imperative::VarBase>(result_forward)) {
       try {
         auto temp_out =
             result_forward.cast<std::shared_ptr<imperative::VarBase>>();
         output_vars.push_back(temp_out);
       } catch (py::cast_error&) {
-        // Only collect Tensor type in 'kwargs' and pass them to backward.
-        // Ignore other types of input temporarily.
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "The `PyLayer.forward` function returns invalid argument, the `%s` "
+            "type argument can not be cast into `Tensor`.",
+            result_forward.ptr()->ob_type->tp_name));
       }
-    } else {
-      // Only collect Tensor type in 'kwargs' and pass them to backward.
-      // Ignore other types of input temporarily.
     }
   }
   if (output_vars.size() == 0) {
...
@@ -303,7 +303,9 @@ static void DisablePrepareDataOpt(
                              disable_opt || pre_disable_opt);
     }
     // disable prepare data if unfriendly op is found
-    disable_opt = IsPrepareDataOptTargetOp(op);
+    if (!disable_opt) {
+      disable_opt = IsPrepareDataOptTargetOp(op);
+    }
   }
 }
...
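The added guard turns disable_opt into a latch: before this fix every op overwrote the flag, so a friendly op appearing after an unfriendly one would silently re-enable the optimization. A minimal standalone illustration of the sticky-flag behavior, using a hypothetical op sequence:

#include <vector>

int main() {
  std::vector<bool> is_unfriendly = {false, true, false};  // hypothetical ops
  bool disable_opt = false;
  for (bool op : is_unfriendly) {
    if (!disable_opt) disable_opt = op;  // latches once set, never resets
  }
  return disable_opt ? 0 : 1;  // stays disabled after the unfriendly op
}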
@@ -46,13 +46,6 @@ class LayerNormOpConverter : public OpConverter {
   auto* Bias_t = Bias_v->GetMutable<framework::LoDTensor>();
   auto* Scale_t = Scale_v->GetMutable<framework::LoDTensor>();

-  int input_num = 1;
-  for (int i = 0; i < X->getDimensions().nbDims; i++) {
-    input_num *= X->getDimensions().d[i];
-  }
-  std::vector<int64_t> mean_shape{input_num};
-  std::vector<int64_t> variance_shape{input_num};
-
   std::unique_ptr<framework::LoDTensor> bias_tensor(
       new framework::LoDTensor());
   std::unique_ptr<framework::LoDTensor> scale_tensor(
@@ -68,10 +61,33 @@ class LayerNormOpConverter : public OpConverter {
   auto* bias_data = bias_tensor->mutable_data<float>(platform::CPUPlace());
   auto* scale_data = scale_tensor->mutable_data<float>(platform::CPUPlace());

-  plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin(
-      bias_data, bias_tensor->numel(), scale_data, scale_tensor->numel(),
-      begin_norm_axis, eps, mean_shape, variance_shape);
-  nvinfer1::IPluginLayer* layernorm_layer = engine_->AddPlugin(&X, 1, plugin);
+  nvinfer1::ILayer* layernorm_layer = nullptr;
+  if (engine_->with_dynamic_shape()) {
+    int input_num = 1;
+    for (int i = begin_norm_axis; i < X->getDimensions().nbDims; i++) {
+      input_num *= X->getDimensions().d[i];
+    }
+    std::vector<int64_t> mean_shape{input_num};
+    std::vector<int64_t> variance_shape{input_num};
+    plugin::LayerNormPluginDynamic* plugin =
+        new plugin::LayerNormPluginDynamic(bias_data, bias_tensor->numel(),
+                                           scale_data, scale_tensor->numel(),
+                                           begin_norm_axis, eps, mean_shape,
+                                           variance_shape);
+    layernorm_layer = engine_->AddDynamicPlugin(&X, 1, plugin);
+  } else {
+    int input_num = 1;
+    for (int i = begin_norm_axis - 1; i < X->getDimensions().nbDims; i++) {
+      input_num *= X->getDimensions().d[i];
+    }
+    std::vector<int64_t> mean_shape{input_num};
+    std::vector<int64_t> variance_shape{input_num};
+    plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin(
+        bias_data, bias_tensor->numel(), scale_data, scale_tensor->numel(),
+        begin_norm_axis, eps, mean_shape, variance_shape);
+    layernorm_layer = engine_->AddPlugin(
+        &X, 1, reinterpret_cast<plugin::PluginTensorRT*>(plugin));
+  }

   auto output_name = op_desc.Output("Y").front();
   engine_->SetWeights(op_desc.Input("Bias").front(), std::move(bias_tensor));
...
@@ -703,7 +703,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
         return false;
         // Paddle-TRT does not support the input tensors: Shape and ShapeTensor
       } else if (desc.Input("Shape").size() >= 1 ||
-                 desc.Input("ShapeTensor").size() >= 1) {
+                 desc.Input("ShapeTensor").size() >= 1 || with_dynamic_shape) {
         return false;
       } else {
         std::vector<int> shape =
...
@@ -57,8 +57,18 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs,
     input_shape.push_back(input_dims.d[i]);
   }
   const auto input_ddim = framework::make_ddim(input_shape);
-  auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis - 1);
+  auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis);
   int feature_size = static_cast<int>(matrix_dim[1]);
+  PADDLE_ENFORCE_EQ(feature_size, scale_.size(),
+                    platform::errors::InvalidArgument(
+                        "scale's size should be equal to the feature_size,"
+                        "but got feature_size:%d, scale's size:%d.",
+                        feature_size, scale_.size()));
+  PADDLE_ENFORCE_EQ(feature_size, bias_.size(),
+                    platform::errors::InvalidArgument(
+                        "bias's size should be equal to the feature_size,"
+                        "but got feature_size:%d, bias's size:%d.",
+                        feature_size, bias_.size()));

   scale_t.Resize(framework::make_ddim({feature_size}));
   bias_t.Resize(framework::make_ddim({feature_size}));
@@ -82,6 +92,163 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs,
   return cudaGetLastError() != cudaSuccess;
 }
nvinfer1::DimsExprs LayerNormPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputDims, int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) {
return inputDims[0];
}
bool LayerNormPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL(
in_out, platform::errors::InvalidArgument(
"The input of layernorm plugin shoule not be nullptr."));
PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos, nb_inputs + nb_outputs));
const nvinfer1::PluginTensorDesc &in = in_out[pos];
if (pos == 0) {
// TODO(Shangzhizhou) FP16 support
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
// output
return in.type == prev.type && in.format == prev.format;
}
nvinfer1::DataType LayerNormPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
PADDLE_ENFORCE_EQ(index, 0,
platform::errors::InvalidArgument(
"The LayerNormPlugin only has one input, so the "
"index value should be 0, but get %d.",
index));
return input_types[0];
}
int LayerNormPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) {
const auto &input_dims = input_desc[0].dims;
int begin_norm_axis = begin_norm_axis_;
float eps = eps_;
std::vector<int> input_shape;
for (int i = 0; i < input_dims.nbDims; i++) {
input_shape.push_back(input_dims.d[i]);
}
const auto input_ddim = framework::make_ddim(input_shape);
auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis);
int feature_size = static_cast<int>(matrix_dim[1]);
PADDLE_ENFORCE_EQ(feature_size, scale_.size(),
platform::errors::InvalidArgument(
"scale's size should be equal to the feature_size,"
"but got feature_size:%d, scale's size:%d.",
feature_size, scale_.size()));
PADDLE_ENFORCE_EQ(feature_size, bias_.size(),
platform::errors::InvalidArgument(
"bias's size should be equal to the feature_size,"
"but got feature_size:%d, bias's size:%d.",
feature_size, bias_.size()));
int device_id;
cudaGetDevice(&device_id);
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32";
const float *input = reinterpret_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
scale_t.Resize(framework::make_ddim({feature_size}));
bias_t.Resize(framework::make_ddim({feature_size}));
mean_t.Resize(framework::make_ddim(mean_shape_));
variance_t.Resize(framework::make_ddim(variance_shape_));
float *scale_d =
scale_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *bias_d = bias_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *mean_d = mean_t.mutable_data<float>(platform::CUDAPlace(device_id));
float *variance_d =
variance_t.mutable_data<float>(platform::CUDAPlace(device_id));
cudaMemcpyAsync(scale_d, scale_.data(), sizeof(float) * feature_size,
cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size,
cudaMemcpyHostToDevice, stream);
paddle::operators::LayerNormDirectCUDAFunctor<float> layer_norm;
layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d,
variance_d, begin_norm_axis, eps);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef TRT_PLUGIN_FP16_AVALIABLE
VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp16";
const half *input = reinterpret_cast<const half *>(inputs[0]);
half *output = static_cast<half *>(outputs[0]);
size_t mean_shape_product = 1;
for (auto s : mean_shape_) {
mean_shape_product *= s;
}
size_t variance_shape_product = 1;
for (auto s : variance_shape_) {
variance_shape_product *= s;
}
if (!scale_gpu_half_d_) {
cudaMalloc(&scale_gpu_half_d_, feature_size * sizeof(half));
}
if (!bias_gpu_half_d_) {
cudaMalloc(&bias_gpu_half_d_, feature_size * sizeof(half));
}
if (!mean_gpu_half_d_) {
cudaMalloc(&mean_gpu_half_d_, mean_shape_product * sizeof(half));
}
if (!variance_gpu_half_d_) {
cudaMalloc(&variance_gpu_half_d_, variance_shape_product * sizeof(half));
}
half *scale_cpu_half =
static_cast<half *>(malloc(feature_size * sizeof(half)));
half *bias_cpu_half =
static_cast<half *>(malloc(feature_size * sizeof(half)));
PADDLE_ENFORCE_EQ(
scale_cpu_half && bias_cpu_half, true,
platform::errors::Unavailable("Out of memory, malloc size %d.",
feature_size * sizeof(half)));
for (int i = 0; i < feature_size; i++) {
scale_cpu_half[i] = static_cast<half>(scale_[i]);
bias_cpu_half[i] = static_cast<half>(bias_[i]);
}
cudaMemcpyAsync(scale_gpu_half_d_, scale_cpu_half,
sizeof(half) * feature_size, cudaMemcpyHostToDevice,
stream);
cudaMemcpyAsync(bias_gpu_half_d_, bias_cpu_half,
sizeof(half) * feature_size, cudaMemcpyHostToDevice,
stream);
free(scale_cpu_half);
free(bias_cpu_half);
paddle::operators::LayerNormDirectCUDAFunctor<half> layer_norm;
layer_norm(stream, input, input_shape, bias_gpu_half_d_, scale_gpu_half_d_,
output, mean_gpu_half_d_, variance_gpu_half_d_, begin_norm_axis,
eps);
#else
PADDLE_THROW(platform::errors::Fatal(
"The layer_norm tensorRT plugin should be "
"complied with CUDA version >= 10.0 when running with fp16. "
"Please recomplie it or try to use fp32 by set "
"config.SetTRTDynamicShapeInfo(min_input_shape, "
"max_input_shape, opt_input_shape, true"));
#endif
} else {
PADDLE_THROW(platform::errors::Fatal(
"The LayerNorm TRT Plugin's input type should be float or half."));
}
return cudaGetLastError() != cudaSuccess;
}
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
...
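Both enqueue paths flatten the input to a [rows, feature_size] matrix, so feature_size is the product of the trailing dimensions starting at the normalization axis; the static-shape branch starts at begin_norm_axis - 1 because TensorRT's implicit-batch dims omit the batch dimension. A standalone sketch of the arithmetic (illustrative helper, not Paddle code):

#include <cstdint>
#include <vector>

// Product of dims[axis..end): the per-row element count that must match
// the scale/bias lengths enforced by PADDLE_ENFORCE_EQ above.
int64_t FeatureSize(const std::vector<int64_t>& dims, int axis) {
  int64_t n = 1;
  for (int i = axis; i < static_cast<int>(dims.size()); ++i) n *= dims[i];
  return n;
}
// e.g. dims = {8, 128, 768}, begin_norm_axis = 2 -> FeatureSize(dims, 2) == 768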
@@ -50,7 +50,7 @@ class LayerNormPlugin : public PluginTensorRT {
   // TRT will call this func when we need to serialize the configuration of
   // tensorrt.
   // It should not be called by users.
-  void serialize(void *buffer) override {
+  void serialize(void* buffer) override {
     SerializeValue(&buffer, getPluginType());
     serializeBase(buffer);
     SerializeValue(&buffer, bias_);
@@ -62,7 +62,7 @@ class LayerNormPlugin : public PluginTensorRT {
   }

  public:
-  LayerNormPlugin(const float *bias, const int bias_num, const float *scale,
+  LayerNormPlugin(const float* bias, const int bias_num, const float* scale,
                   const int scale_num, int begin_norm_axis, float eps,
                   std::vector<int64_t> mean_shape,
                   std::vector<int64_t> variance_shape)
@@ -78,7 +78,7 @@ class LayerNormPlugin : public PluginTensorRT {

   // It was used for tensorrt deserialization.
   // It should not be called by users.
-  LayerNormPlugin(void const *serialData, size_t serialLength) {
+  LayerNormPlugin(void const* serialData, size_t serialLength) {
     deserializeBase(serialData, serialLength);
     DeserializeValue(&serialData, &serialLength, &bias_);
     DeserializeValue(&serialData, &serialLength, &scale_);
@@ -90,20 +90,180 @@ class LayerNormPlugin : public PluginTensorRT {
   ~LayerNormPlugin() {}
   int initialize() override;

-  LayerNormPlugin *clone() const override {
+  LayerNormPlugin* clone() const override {
     return new LayerNormPlugin(bias_.data(), bias_.size(), scale_.data(),
                                scale_.size(), begin_norm_axis_, eps_,
                                mean_shape_, variance_shape_);
   }

-  const char *getPluginType() const override { return "layer_norm_plugin"; }
+  const char* getPluginType() const override { return "layer_norm_plugin"; }
   int getNbOutputs() const override { return 1; }
-  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                      int nbInputDims) override;
-  int enqueue(int batchSize, const void *const *inputs, void **outputs,
-              void *workspace, cudaStream_t stream) override;
+  int enqueue(int batchSize, const void* const* inputs, void** outputs,
+              void* workspace, cudaStream_t stream) override;
 };
class LayerNormPluginDynamic : public DynamicPluginTensorRT {
public:
LayerNormPluginDynamic(const float* bias, const int bias_num,
const float* scale, const int scale_num,
int begin_norm_axis, float eps,
std::vector<int64_t> mean_shape,
std::vector<int64_t> variance_shape)
: begin_norm_axis_(begin_norm_axis),
eps_(eps),
mean_shape_(mean_shape),
variance_shape_(variance_shape),
scale_gpu_half_d_(nullptr),
bias_gpu_half_d_(nullptr),
mean_gpu_half_d_(nullptr),
variance_gpu_half_d_(nullptr) {
bias_.resize(bias_num);
scale_.resize(scale_num);
std::copy(bias, bias + bias_num, bias_.data());
std::copy(scale, scale + scale_num, scale_.data());
}
LayerNormPluginDynamic(void const* serialData, size_t serialLength)
: scale_gpu_half_d_(nullptr),
bias_gpu_half_d_(nullptr),
mean_gpu_half_d_(nullptr),
variance_gpu_half_d_(nullptr) {
DeserializeValue(&serialData, &serialLength, &bias_);
DeserializeValue(&serialData, &serialLength, &scale_);
DeserializeValue(&serialData, &serialLength, &begin_norm_axis_);
DeserializeValue(&serialData, &serialLength, &eps_);
DeserializeValue(&serialData, &serialLength, &mean_shape_);
DeserializeValue(&serialData, &serialLength, &variance_shape_);
}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new LayerNormPluginDynamic(bias_.data(), bias_.size(), scale_.data(),
scale_.size(), begin_norm_axis_, eps_,
mean_shape_, variance_shape_);
}
const char* getPluginType() const override { return "layernorm_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override { return 0; }
size_t getSerializationSize() const override {
return SerializedSize(bias_) + SerializedSize(scale_) +
SerializedSize(begin_norm_axis_) + SerializedSize(eps_) +
SerializedSize(mean_shape_) + SerializedSize(variance_shape_);
}
void serialize(void* buffer) const override {
SerializeValue(&buffer, bias_);
SerializeValue(&buffer, scale_);
SerializeValue(&buffer, begin_norm_axis_);
SerializeValue(&buffer, eps_);
SerializeValue(&buffer, mean_shape_);
SerializeValue(&buffer, variance_shape_);
}
nvinfer1::DimsExprs getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
~LayerNormPluginDynamic() {
if (scale_gpu_half_d_) {
cudaFree(scale_gpu_half_d_);
}
if (bias_gpu_half_d_) {
cudaFree(bias_gpu_half_d_);
}
if (mean_gpu_half_d_) {
cudaFree(mean_gpu_half_d_);
}
if (variance_gpu_half_d_) {
cudaFree(variance_gpu_half_d_);
}
}
void destroy() override { delete this; }
private:
std::vector<float> bias_;
std::vector<float> scale_;
framework::Tensor scale_t;
framework::Tensor bias_t;
framework::Tensor mean_t;
framework::Tensor variance_t;
int begin_norm_axis_;
float eps_;
std::vector<int64_t> mean_shape_;
std::vector<int64_t> variance_shape_;
half* scale_gpu_half_d_;
half* bias_gpu_half_d_;
half* mean_gpu_half_d_;
half* variance_gpu_half_d_;
};
class LayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator {
public:
LayerNormPluginDynamicCreator() {}
const char* getPluginName() const override { return "layernorm_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new LayerNormPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(LayerNormPluginDynamicCreator);
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
...
@@ -325,11 +325,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te
 # densebox
 set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox")
 download_data_without_verify(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz")
-#inference_analysis_test(test_analyzer_detect SRCS analyzer_detect_tester.cc
-#        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-#        ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt
-#             --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)
-#set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysis_predictor=2)
+inference_analysis_test(test_analyzer_detect_functional_mkldnn SRCS analyzer_detect_functional_mkldnn_tester.cc
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt
+             --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt)

 # mobilenet with transpose op
 set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet")
...
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
DEFINE_string(infer_shape, "", "data shape file");
DEFINE_int32(sample, 20, "number of sample");
namespace paddle {
namespace inference {
namespace analysis {
struct Record {
std::vector<float> data;
std::vector<int32_t> shape;
};
Record ProcessALine(const std::string &line, const std::string &shape_line) {
VLOG(3) << "process a line";
Record record;
std::vector<std::string> data_strs;
split(line, ' ', &data_strs);
for (auto &d : data_strs) {
record.data.push_back(std::stof(d));
}
std::vector<std::string> shape_strs;
split(shape_line, ' ', &shape_strs);
for (auto &s : shape_strs) {
record.shape.push_back(std::stoi(s));
}
return record;
}
void SetConfig(AnalysisConfig *cfg) {
cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
cfg->DisableGpu();
// cfg->SwitchIrDebug(); // Enable to have graphs dumped
cfg->SwitchSpecifyInputNames(false);
cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads);
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::string &line, const std::string &shape_line) {
auto record = ProcessALine(line, shape_line);
PaddleTensor input;
input.shape = record.shape;
input.dtype = PaddleDType::FLOAT32;
size_t input_size = record.data.size() * sizeof(float);
input.data.Resize(input_size);
memcpy(input.data.data(), record.data.data(), input_size);
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots);
}
#ifdef PADDLE_WITH_MKLDNN
int GetNumCachedObjects(void) {
auto &pool = platform::DeviceContextPool::Instance();
platform::CPUPlace place;
auto onednn_dev_ctx =
dynamic_cast<platform::MKLDNNDeviceContext *>(pool.Get(place));
return onednn_dev_ctx->GetCachedObjectsNumber();
}
void validate_cache_onednn(int cache_capacity = 1) {
AnalysisConfig cfg;
SetConfig(&cfg);
cfg.EnableMKLDNN();
cfg.SetMkldnnCacheCapacity(cache_capacity);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
std::vector<std::vector<PaddleTensor>> ref_outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
std::ifstream file(FLAGS_infer_data);
std::ifstream infer_file(FLAGS_infer_shape);
std::vector<std::string> lines;
std::vector<std::string> shape_lines;
// Let's work with 4 samples
auto num_samples = 4;
ref_outputs.resize(num_samples);
lines.resize(num_samples);
shape_lines.resize(num_samples);
// Let's remember number of cached objects before
// execution and after every single execution
std::vector<int> cache_filling;
cache_filling.push_back(GetNumCachedObjects());
// run predictions sequentially
for (int i = 0; i < num_samples; ++i) {
std::getline(file, lines[i]);
std::getline(infer_file, shape_lines[i]);
SetInput(&input_slots_all, lines[i], shape_lines[i]);
predictor->Run(input_slots_all[i], &ref_outputs[i], FLAGS_batch_size);
// record number of cached objects
cache_filling.push_back(GetNumCachedObjects());
}
file.close();
infer_file.close();
predictor.reset(nullptr);
cache_filling.push_back(GetNumCachedObjects());
// Compare results
// First and last value should be equal, i.e. before using cache (empty) and
// after releasing executor
PADDLE_ENFORCE_EQ(
cache_filling[0], cache_filling[cache_filling.size() - 1],
platform::errors::Fatal("Cache size before execution and after "
"releasing Executor do not match"));
// Iterate to check that the cache does not grow
// beyond the configured cache capacity
if (cache_capacity != 0) {
for (int i = cache_capacity + 1; i < num_samples + 1; ++i) {
PADDLE_ENFORCE_EQ(
cache_filling[cache_capacity], cache_filling[i],
platform::errors::Fatal("Cache capacity should not increase "
"after full capacity is used"));
}
}
}
TEST(Analyzer_detect, validate_cache_onednn) {
validate_cache_onednn(2 /*cache_capacity */);
}
#endif
} // namespace analysis
} // namespace inference
} // namespace paddle
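A minimal sketch of how the cache-capacity knob exercised by the test above is typically wired into a standalone inference program. The model/params paths and the capacity value are illustrative assumptions; only AnalysisConfig calls already used in the test are relied on.
#include <memory>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
std::unique_ptr<paddle::PaddlePredictor> MakeOneDNNPredictor() {
  paddle::AnalysisConfig cfg;
  cfg.SetModel("/path/to/model", "/path/to/params");  // hypothetical paths
  cfg.DisableGpu();
  cfg.EnableMKLDNN();
  // Keep oneDNN primitives cached for at most 2 distinct input shapes;
  // older shapes are evicted, bounding memory for variable-shape workloads.
  cfg.SetMkldnnCacheCapacity(2);
  return paddle::CreatePaddlePredictor<paddle::AnalysisConfig>(cfg);
}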
...@@ -167,7 +167,7 @@ def run_convert(): ...@@ -167,7 +167,7 @@ def run_convert():
os.path.getsize(output_file) == FULL_SIZE_BYTES): os.path.getsize(output_file) == FULL_SIZE_BYTES):
if os.path.exists(output_file): if os.path.exists(output_file):
sys.stderr.write( sys.stderr.write(
"\n\nThe existing binary file is broken. Start to generate new one...\n\n". "\n\nThe existing binary file[{}] is broken. Start to generate new one...\n\n".
format(output_file)) format(output_file))
os.remove(output_file) os.remove(output_file)
if retry < try_limit: if retry < try_limit:
......
...@@ -120,6 +120,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> { ...@@ -120,6 +120,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
: len; : len;
} }
} else if (context.Attr<bool>("set_constant")) { } else if (context.Attr<bool>("set_constant")) {
// TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION.
math::SetConstant<DeviceContext, T> set_constant; math::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx, fused_tensor, set_constant(dev_ctx, fused_tensor,
static_cast<T>(context.Attr<float>("constant"))); static_cast<T>(context.Attr<float>("constant")));
...@@ -145,6 +146,14 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> { ...@@ -145,6 +146,14 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
offset = 0; offset = 0;
std::stringstream ss; std::stringstream ss;
ss << "alloc_space_for_vars: "; ss << "alloc_space_for_vars: ";
#if defined(PADDLE_WITH_ASCEND_CL)
auto stream =
context.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
platform::NPUMemsetAsync(
static_cast<void *>(fused_tensor->mutable_data<T>(dev_ctx.GetPlace())),
0, fused_tensor->numel() * sizeof(T), stream);
#endif
for (size_t i = 0; i < out_tensors.size(); ++i) { for (size_t i = 0; i < out_tensors.size(); ++i) {
size_t len = static_cast<size_t>(out_tensors[i]->numel()); size_t len = static_cast<size_t>(out_tensors[i]->numel());
auto dim = out_tensors[i]->dims(); auto dim = out_tensors[i]->dims();
...@@ -160,6 +169,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> { ...@@ -160,6 +169,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")"
<< " address: " << out_tensors[i]->data<void>() << ", "; << " address: " << out_tensors[i]->data<void>() << ", ";
} }
PADDLE_ENFORCE_EQ(
static_cast<int64_t>(offset), fused_tensor->numel(),
platform::errors::InvalidArgument(
"The alloc_space_for_vars's offset: %s is not equal to "
"fused_tensor's numel: %s.",
offset, fused_tensor->numel()));
VLOG(10) << ss.str(); VLOG(10) << ss.str();
} }
...@@ -191,13 +206,13 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> { ...@@ -191,13 +206,13 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
<< ") " << ") "
<< " addres:" << lod_tensors[i]->data<void>() << ", "; << " addres:" << lod_tensors[i]->data<void>() << ", ";
*numel += use_align *numel += use_align
? platform::Alignment( ? platform::Alignment(
static_cast<size_t>(size) * size_of_dtype, place) / static_cast<size_t>(size) * size_of_dtype, place) /
size_of_dtype size_of_dtype
: static_cast<size_t>(size); : static_cast<size_t>(size);
} }
VLOG(10) << ss.str(); VLOG(10) << ss.str();
} }
}; };
...@@ -309,6 +324,16 @@ REGISTER_OP_XPU_KERNEL( ...@@ -309,6 +324,16 @@ REGISTER_OP_XPU_KERNEL(
ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, double>); ops::CoalesceTensorOpKernel<paddle::platform::XPUDeviceContext, double>);
#endif #endif
#if defined(PADDLE_WITH_ASCEND_CL)
REGISTER_OP_NPU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext,
plat::float16>,
ops::CoalesceTensorOpKernel<paddle::platform::CPUDeviceContext, double>);
#endif
REGISTER_OP_VERSION(coalesce_tensor) REGISTER_OP_VERSION(coalesce_tensor)
.AddCheckpoint( .AddCheckpoint(
R"ROC( R"ROC(
......
...@@ -43,12 +43,10 @@ class BarrierOpCUDAKernel : public framework::OpKernel<T> { ...@@ -43,12 +43,10 @@ class BarrierOpCUDAKernel : public framework::OpKernel<T> {
ncclRedOp_t nccl_red_type = ncclSum; ncclRedOp_t nccl_red_type = ncclSum;
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
auto comm_stream =
platform::NCCLCommContext::Instance().Get(rid, place)->stream();
#ifdef PADDLE_WITH_RCCL #ifdef PADDLE_WITH_RCCL
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(comm_stream)); PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else #else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(comm_stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif #endif
#else #else
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
......
...@@ -31,13 +31,12 @@ class CEmbeddingOp : public framework::OperatorWithKernel { ...@@ -31,13 +31,12 @@ class CEmbeddingOp : public framework::OperatorWithKernel {
int ids_rank = ids_dims.size(); int ids_rank = ids_dims.size();
VLOG(5) << "ids rank is " << ids_rank << std::endl; VLOG(5) << "ids rank is " << ids_rank << std::endl;
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(table_dims.size(), 2,
table_dims.size(), 2, platform::errors::InvalidArgument(
platform::errors::InvalidArgument( "The dimensions of the 'c_embedding' must be 2. "
"ShapeError: The dimensions of the 'c_embedding' must be 2. " "But received c_embedding's dimensions = %d, "
"But received c_embedding's dimensions = %d, " "c_embedding's shape = [%s].",
"c_embedding's shape = [%s].", table_dims.size(), table_dims));
table_dims.size(), table_dims));
auto output_dims = framework::vectorize(ids_dims); auto output_dims = framework::vectorize(ids_dims);
output_dims.push_back(table_dims[1]); output_dims.push_back(table_dims[1]);
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
namespace paddle {
namespace operators {
class CSoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Logits"), "Input", "Logits",
"CSoftmaxWithCrossEntropyOp");
OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label",
"CSoftmaxWithCrossEntropyOp");
OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax",
"CSoftmaxWithCrossEntropyOp");
OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss",
"CSoftmaxWithCrossEntropyOp");
auto logits_dims = ctx->GetInputDim("Logits");
auto labels_dims = ctx->GetInputDim("Label");
auto logits_rank = logits_dims.size();
auto axis = logits_rank - 1;
for (int i = 0; i < logits_rank; i++) {
if (i != axis) {
if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) {
PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i],
platform::errors::InvalidArgument(
"Input(Logits) and Input(Label) should in "
"same shape in dimensions except axis."));
}
}
}
PADDLE_ENFORCE_EQ(
labels_dims[logits_rank - 1], 1UL,
platform::errors::InvalidArgument(
"The last dimension of Input(Label) should be 1. "
"But received: the last dimension of Input(Label) is [%d], "
"where the last dimension index is [%d].",
labels_dims[logits_rank - 1], logits_rank - 1));
ctx->SetOutputDim("Softmax", logits_dims);
logits_dims[axis] = 1;
ctx->SetOutputDim("Loss", logits_dims);
ctx->ShareLoD("Logits", /*->*/ "Softmax");
ctx->ShareLoD("Logits", /*->*/ "Loss");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Logits"),
ctx.device_context());
}
};
class CSoftmaxWithCrossEntropyOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("Logits",
"(Tensor, default: Tensor<float>), The input tensor of unscaled "
"log probabilities, whose dimension :attr:`axis` should be scaled "
"by softmax.");
AddInput(
"Label",
"(Tensor) The input tensor of groud truth label. If :attr:`soft_label` "
"is set to false, Label is a Tensor<int64> in same shape with "
"Input(Logits) except the shape in dimension :attr:`axis` as 1. If "
"soft_label is set to true, Label is a Tensor<float/double> in same "
"shape with Input(Logits).");
AddOutput(
"Softmax",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits). "
"The outputs value of softmax activation by given the input batch, "
"which will be used in backward calculation.");
AddOutput("Loss",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits) "
"except the shape in dimension :attr:`axis` as 1. The cross "
"entropy loss.");
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0);
AddAttr<int>("rank",
"(int default 0) rank id for CSoftmaxWithCrossEntropy.")
.SetDefault(0);
AddAttr<int>("nranks",
"(int default 1) nranks id for CSoftmaxWithCrossEntropy.")
.SetDefault(0);
AddComment(R"DOC(
CSoftmaxWithCrossEntropy Operator
)DOC");
}
};
class CSoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), true,
platform::errors::InvalidArgument(
"Input(Loss@Grad) should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true,
platform::errors::InvalidArgument(
"Input(Softmax) should be not null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Label"), true,
platform::errors::InvalidArgument("Input(Label) should be not null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true,
platform::errors::InvalidArgument(
"Output(Logits@Grad) should be not null."));
ctx->SetOutputDim(framework::GradVarName("Logits"),
ctx->GetInputDim("Softmax"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Loss")),
ctx.device_context());
}
};
template <typename T>
class CSoftmaxWithCrossEntropyOpGradMaker
: public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("c_softmax_with_cross_entropy_grad");
op->SetInput("Softmax", this->Output("Softmax"));
op->SetInput("Label", this->Input("Label"));
op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
op->SetAttrMap(this->Attrs());
op->SetOutput(framework::GradVarName("Logits"), this->InputGrad("Logits"));
}
};
DECLARE_INPLACE_OP_INFERER(CSoftmaxWithCrossEntropyInplaceInferer,
{"Logits", "Softmax"});
DECLARE_INPLACE_OP_INFERER(CSoftmaxWithCrossEntropyGradInplaceInferer,
{"Softmax", framework::GradVarName("Logits")});
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
c_softmax_with_cross_entropy, ops::CSoftmaxWithCrossEntropyOp,
ops::CSoftmaxWithCrossEntropyOpMaker,
ops::CSoftmaxWithCrossEntropyOpGradMaker<paddle::framework::OpDesc>,
ops::CSoftmaxWithCrossEntropyOpGradMaker<paddle::imperative::OpBase>,
ops::CSoftmaxWithCrossEntropyInplaceInferer);
REGISTER_OPERATOR(c_softmax_with_cross_entropy_grad,
ops::CSoftmaxWithCrossEntropyOpGrad,
ops::CSoftmaxWithCrossEntropyGradInplaceInferer);
REGISTER_OP_CPU_KERNEL(c_softmax_with_cross_entropy,
ops::CSoftmaxWithCrossEntropyOpCPUKernel<float>,
ops::CSoftmaxWithCrossEntropyOpCPUKernel<double>,
ops::CSoftmaxWithCrossEntropyOpCPUKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#include "paddle/fluid/string/string_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaximumNumBlocks);
}
template <typename T, typename IndexT>
__global__ void MaskLabelByIndex(T* predicted_logits, const T* logit,
const IndexT* label, const int start_index,
const int end_index, const int64_t N,
const int64_t D, const int nranks) {
CUDA_KERNEL_LOOP(i, N) {
auto real_label = label[i];
PADDLE_ENFORCE((real_label < D * nranks) && (real_label >= 0),
"The index is out of bounds, "
"please check whether the value of label and "
"input meet the class number. It should "
"be less than [%d], but received [%d]",
D * nranks, real_label);
if (real_label >= start_index && real_label < end_index) {
predicted_logits[i] = logit[i * D + real_label - start_index];
}
}
}
template <typename T, typename IndexT>
__global__ void MaskLabelByIndexGrad(T* logits_grad, const T* loss_grad,
const IndexT* labels,
const int start_index, const int end_index,
const int64_t N, const int64_t D) {
CUDA_KERNEL_LOOP(i, N * D) {
auto row = i / D;
auto col = i % D;
if ((col + start_index) == labels[row]) {
logits_grad[i] = (logits_grad[i] - static_cast<T>(1.0)) * loss_grad[row];
} else {
logits_grad[i] *= loss_grad[row];
}
}
}
template <typename T>
class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* logits = ctx.Input<Tensor>("Logits");
const Tensor* labels = ctx.Input<Tensor>("Label");
Tensor* softmax = ctx.Output<Tensor>("Softmax");
Tensor* loss = ctx.Output<Tensor>("Loss");
const int rid = ctx.Attr<int>("ring_id");
const int nranks = ctx.Attr<int>("nranks");
const int rank = ctx.Attr<int>("rank");
const auto& place = ctx.GetPlace();
const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
// use global calculate stream
const auto stream = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
// allocate memory on device.
softmax->mutable_data<T>(place);
loss->mutable_data<T>(place);
const auto& logits_dims = logits->dims();
const auto& labels_dims = labels->dims();
const int axis = logits_dims.size() - 1;
const int N = SizeToAxis(axis, logits_dims);
const int D = SizeFromAxis(axis, logits_dims);
Tensor logits_2d, softmax_2d, loss_2d;
logits_2d.ShareDataWith(*logits).Resize({N, D});
softmax_2d.ShareDataWith(*softmax).Resize({N, D});
loss_2d.ShareDataWith(*loss).Resize({N, 1});
auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
// step 1, obtain logit_max
Tensor logits_max;
logits_max =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
void* logits_max_buff = logits_max.mutable_data<T>(place);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
Eigen::DSizes<int, 1> along_axis(1);
eigen_logits_max.device(*dev_ctx.eigen_device()) =
eigen_logits.maximum(along_axis);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
logits_max_buff, logits_max_buff, logits_max.numel(),
platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(),
stream));
// step 2, obtain logit - logit_max
Eigen::DSizes<int, 2> batch_by_one(N, 1);
Eigen::DSizes<int, 2> one_by_class(1, D);
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_logits -
eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
.unaryExpr(math::ValueClip<T>());
// step 3, obtain predict target
Tensor predicted_logits;
predicted_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
predicted_logits.mutable_data<T>(place);
auto t = framework::EigenVector<T>::Flatten(predicted_logits);
t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
const int start_index = rank * D;
const int end_index = start_index + D;
int blocks = NumBlocks(N);
int threads = kNumCUDAThreads;
const auto& label_type = labels->type();
if (label_type == framework::proto::VarType::INT32) {
MaskLabelByIndex<T, int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
predicted_logits.data<T>(), softmax_2d.data<T>(),
labels->data<int32_t>(), start_index, end_index, N, D, nranks);
} else if (label_type == framework::proto::VarType::INT64) {
MaskLabelByIndex<T, int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
predicted_logits.data<T>(), softmax_2d.data<T>(),
labels->data<int64_t>(), start_index, end_index, N, D, nranks);
}
void* predict_logits_buff = predicted_logits.mutable_data<T>(place);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
predict_logits_buff, predict_logits_buff, predicted_logits.numel(),
platform::ToNCCLDataType(predicted_logits.type()), ncclSum,
comm->comm(), stream));
// step 4, obtain exp(logit)
eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp();
// step 5, obtain sum_exp_logits
Tensor sum_exp_logits;
sum_exp_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
eigen_softmax.sum(along_axis);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(),
platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, comm->comm(),
stream));
auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
eigen_loss.device(*dev_ctx.eigen_device()) =
(eigen_sum_exp_logits.log().unaryExpr(math::TolerableValue<T>()) -
eigen_predicted_logits)
.unaryExpr(math::TolerableValue<T>());
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_softmax *
eigen_sum_exp_logits.inverse().broadcast(one_by_class));
}
};
template <typename T>
class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* labels = context.Input<Tensor>("Label");
const Tensor* loss_grad =
context.Input<Tensor>(framework::GradVarName("Loss"));
Tensor* logit_grad =
context.Output<Tensor>(framework::GradVarName("Logits"));
const Tensor* softmax = context.Input<Tensor>("Softmax");
const int rank = context.Attr<int>("rank");
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
if (logit_grad != softmax) {
framework::TensorCopy(*softmax, context.GetPlace(),
context.device_context(), logit_grad);
}
const auto softmax_dims = softmax->dims();
const int axis = softmax_dims.size() - 1;
const int N = SizeToAxis(axis, softmax_dims);
const int D = SizeFromAxis(axis, softmax_dims);
Tensor logit_grad_2d;
logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D});
int blocks = NumBlocks(N * D);
int threads = kNumCUDAThreads;
const auto& label_type = labels->type();
const int start_index = rank * D;
const int end_index = start_index + D;
if (label_type == framework::proto::VarType::INT32) {
MaskLabelByIndexGrad<T,
int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
logit_grad_2d.data<T>(), loss_grad->data<T>(),
labels->data<int32_t>(), start_index, end_index, N, D);
} else if (label_type == framework::proto::VarType::INT64) {
MaskLabelByIndexGrad<T,
int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
logit_grad_2d.data<T>(), loss_grad->data<T>(),
labels->data<int64_t>(), start_index, end_index, N, D);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
c_softmax_with_cross_entropy,
ops::CSoftmaxWithCrossEntropyOpCUDAKernel<float>,
ops::CSoftmaxWithCrossEntropyOpCUDAKernel<double>,
ops::CSoftmaxWithCrossEntropyOpCUDAKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(
c_softmax_with_cross_entropy_grad,
ops::CSoftmaxWithCrossEntropyGradCUDAKernel<float>,
ops::CSoftmaxWithCrossEntropyGradCUDAKernel<paddle::platform::float16>,
ops::CSoftmaxWithCrossEntropyGradCUDAKernel<double>);
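For reference, a compact restatement of what the forward kernel's numbered steps compute when the class dimension is sharded across nranks ranks (each rank r holding D of the D*nranks classes), plus the gradient the backward kernel applies. This only summarizes the code above; it adds no behavior.
m_n = \max_r \max_j z^{(r)}_{n,j} \quad \text{(ncclAllReduce with ncclMax)}
s_n = \sum_r \sum_j e^{z^{(r)}_{n,j} - m_n} \quad \text{(ncclAllReduce with ncclSum)}
\mathrm{loss}_n = \log s_n - (z_{n,y_n} - m_n), \qquad \mathrm{softmax}_{n,j} = e^{z_{n,j} - m_n} / s_n
\partial z_{n,j} = (\mathrm{softmax}_{n,j} - \mathbb{1}[\,j + \mathrm{start\_index} = y_n\,]) \cdot \partial \mathrm{loss}_n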
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/softmax_op.h"
namespace paddle {
namespace operators {
template <typename T>
class CSoftmaxWithCrossEntropyOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW(platform::errors::Unavailable(
"Do not support c_embedding for cpu kernel now."));
}
};
} // namespace operators
} // namespace paddle
type: "affine_channel"
def {
inputs {
name: "X"
}
inputs {
name: "Scale"
}
inputs {
name: "Bias"
}
attrs {
name: "data_layout"
type: STRING
}
outputs {
name: "Out"
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "flatten2"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
outputs {
name: "XShape"
}
attrs {
name: "axis"
type: INT
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "gru"
def {
inputs {
name: "Input"
}
inputs {
name: "H0"
}
inputs {
name: "Weight"
}
inputs {
name: "Bias"
}
outputs {
name: "BatchGate"
}
outputs {
name: "BatchResetHiddenPrev"
}
outputs {
name: "BatchHidden"
}
outputs {
name: "Hidden"
}
attrs {
name: "activation"
type: STRING
}
attrs {
name: "gate_activation"
type: STRING
}
attrs {
name: "is_reverse"
type: BOOLEAN
}
attrs {
name: "origin_mode"
type: BOOLEAN
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "layer_norm"
def {
inputs {
name: "X"
}
inputs {
name: "Scale"
}
inputs {
name: "Bias"
}
outputs {
name: "Y"
}
outputs {
name: "Mean"
}
outputs {
name: "Variance"
}
attrs {
name: "epsilon"
type: FLOAT
}
attrs {
name: "begin_norm_axis"
type: INT
}
}
extra {
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "lstm"
def {
inputs {
name: "Input"
}
inputs {
name: "H0"
}
inputs {
name: "C0"
}
inputs {
name: "Weight"
}
inputs {
name: "Bias"
}
outputs {
name: "Hidden"
}
outputs {
name: "Cell"
}
outputs {
name: "BatchGate"
}
outputs {
name: "BatchCellPreAct"
}
attrs {
name: "use_peepholes"
type: BOOLEAN
}
attrs {
name: "is_reverse"
type: BOOLEAN
}
attrs {
name: "gate_activation"
type: STRING
}
attrs {
name: "cell_activation"
type: STRING
}
attrs {
name: "candidate_activation"
type: STRING
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "matmul"
def {
inputs {
name: "X"
}
inputs {
name: "Y"
}
outputs {
name: "Out"
}
attrs {
name: "alpha"
type: FLOAT
}
attrs {
name: "transpose_X"
type: BOOLEAN
}
attrs {
name: "transpose_Y"
type: BOOLEAN
}
}
extra {
attrs {
name: "Scale_out"
type: FLOAT
}
attrs {
name: "Scale_x"
type: FLOAT
}
attrs {
name: "Scale_y"
type: FLOAT
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "force_fp32_output"
type: BOOLEAN
}
attrs {
name: "fused_reshape_Out"
type: INTS
}
attrs {
name: "fused_reshape_X"
type: INTS
}
attrs {
name: "fused_reshape_Y"
type: INTS
}
attrs {
name: "fused_transpose_Out"
type: INTS
}
attrs {
name: "fused_transpose_X"
type: INTS
}
attrs {
name: "fused_transpose_Y"
type: INTS
}
}
type: "matmul_v2"
def {
inputs {
name: "X"
}
inputs {
name: "Y"
}
outputs {
name: "Out"
}
attrs {
name: "trans_x"
type: BOOLEAN
}
attrs {
name: "trans_y"
type: BOOLEAN
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "pool2d"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "pooling_type"
type: STRING
}
attrs {
name: "ksize"
type: INTS
}
attrs {
name: "global_pooling"
type: BOOLEAN
}
attrs {
name: "strides"
type: INTS
}
attrs {
name: "paddings"
type: INTS
}
attrs {
name: "exclusive"
type: BOOLEAN
}
attrs {
name: "adaptive"
type: BOOLEAN
}
attrs {
name: "ceil_mode"
type: BOOLEAN
}
attrs {
name: "data_format"
type: STRING
}
attrs {
name: "padding_algorithm"
type: STRING
}
}
extra {
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "use_cudnn"
type: BOOLEAN
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "reshape2"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "shape"
type: INTS
}
}
extra {
inputs {
name: "Shape"
}
inputs {
name: "ShapeTensor"
}
outputs {
name: "XShape"
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "scale"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "bias"
type: FLOAT
}
attrs {
name: "scale"
type: FLOAT
}
attrs {
name: "bias_after_scale"
type: BOOLEAN
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "softmax"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INT
}
attrs {
name: "data_format"
type: STRING
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "use_cudnn"
type: BOOLEAN
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
}
type: "squeeze2"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
outputs {
name: "XShape"
}
attrs {
name: "axes"
type: INTS
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "transpose"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INTS
}
attrs {
name: "data_format"
type: STRING
}
}
extra {
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "transpose"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INTS
}
attrs {
name: "data_format"
type: STRING
}
}
extra {
outputs {
name: "XShape"
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "unsqueeze2"
def {
inputs {
name: "X"
}
inputs {
name: "AxesTensor"
}
inputs {
name: "AxesTensorList"
}
outputs {
name: "Out"
}
outputs {
name: "XShape"
}
attrs {
name: "axes"
type: INTS
}
}
extra {
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/operators/detection/yolo_box_op.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -31,19 +32,44 @@ class YoloBoxOp : public framework::OperatorWithKernel { ...@@ -31,19 +32,44 @@ class YoloBoxOp : public framework::OperatorWithKernel {
auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors"); auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
int anchor_num = anchors.size() / 2; int anchor_num = anchors.size() / 2;
auto class_num = ctx->Attrs().Get<int>("class_num"); auto class_num = ctx->Attrs().Get<int>("class_num");
auto iou_aware = ctx->Attrs().Get<bool>("iou_aware");
auto iou_aware_factor = ctx->Attrs().Get<float>("iou_aware_factor");
PADDLE_ENFORCE_EQ(dim_x.size(), 4, platform::errors::InvalidArgument( PADDLE_ENFORCE_EQ(dim_x.size(), 4, platform::errors::InvalidArgument(
"Input(X) should be a 4-D tensor." "Input(X) should be a 4-D tensor."
"But received X dimension(%s)", "But received X dimension(%s)",
dim_x.size())); dim_x.size()));
PADDLE_ENFORCE_EQ( if (iou_aware) {
dim_x[1], anchor_num * (5 + class_num), PADDLE_ENFORCE_EQ(
platform::errors::InvalidArgument( dim_x[1], anchor_num * (6 + class_num),
"Input(X) dim[1] should be equal to (anchor_mask_number * (5 " platform::errors::InvalidArgument(
"+ class_num))." "Input(X) dim[1] should be equal to (anchor_mask_number * (6 "
"But received dim[1](%s) != (anchor_mask_number * " "+ class_num)) while iou_aware is true."
"(5+class_num)(%s).", "But received dim[1](%s) != (anchor_mask_number * "
dim_x[1], anchor_num * (5 + class_num))); "(6+class_num)(%s).",
dim_x[1], anchor_num * (6 + class_num)));
PADDLE_ENFORCE_GE(
iou_aware_factor, 0,
platform::errors::InvalidArgument(
"Attr(iou_aware_factor) should be greater than or equal to 0. "
"But received iou_aware_factor (%s)",
iou_aware_factor));
PADDLE_ENFORCE_LE(
iou_aware_factor, 1,
platform::errors::InvalidArgument(
"Attr(iou_aware_factor) should be less than or equal to 1. "
"But received iou_aware_factor (%s)",
iou_aware_factor));
} else {
PADDLE_ENFORCE_EQ(
dim_x[1], anchor_num * (5 + class_num),
platform::errors::InvalidArgument(
"Input(X) dim[1] should be equal to (anchor_mask_number * (5 "
"+ class_num))."
"But received dim[1](%s) != (anchor_mask_number * "
"(5+class_num)(%s).",
dim_x[1], anchor_num * (5 + class_num)));
}
PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2, PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Input(ImgSize) should be a 2-D tensor." "Input(ImgSize) should be a 2-D tensor."
...@@ -140,6 +166,10 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -140,6 +166,10 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
"Scale the center point of decoded bounding " "Scale the center point of decoded bounding "
"box. Default 1.0") "box. Default 1.0")
.SetDefault(1.); .SetDefault(1.);
AddAttr<bool>("iou_aware", "Whether use iou aware. Default false.")
.SetDefault(false);
AddAttr<float>("iou_aware_factor", "iou aware factor. Default 0.5.")
.SetDefault(0.5);
AddComment(R"DOC( AddComment(R"DOC(
This operator generates YOLO detection boxes from output of YOLOv3 network. This operator generates YOLO detection boxes from output of YOLOv3 network.
...@@ -147,7 +177,8 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -147,7 +177,8 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
should be the same, H and W specify the grid size, each grid point predict should be the same, H and W specify the grid size, each grid point predict
given number boxes, this given number, which following will be represented as S, given number boxes, this given number, which following will be represented as S,
is specified by the number of anchors. In the second dimension(the channel is specified by the number of anchors. In the second dimension(the channel
dimension), C should be equal to S * (5 + class_num), class_num is the object dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false,
otherwise C should be equal to S * (6 + class_num). class_num is the object
category number of source dataset(such as 80 in coco dataset), so the category number of source dataset(such as 80 in coco dataset), so the
second(channel) dimension, apart from 4 box location coordinates x, y, w, h, second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
also includes confidence score of the box and class one-hot key of each anchor also includes confidence score of the box and class one-hot key of each anchor
...@@ -183,6 +214,15 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -183,6 +214,15 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker {
score_{pred} = score_{conf} * score_{class} score_{pred} = score_{conf} * score_{class}
$$ $$
where the confidence scores follow the formula below
.. math::
score_{conf} = \begin{cases}
obj, & \text{if } iou\_aware == false \\
obj^{1 - iou\_aware\_factor} \cdot iou^{iou\_aware\_factor}, & \text{otherwise}
\end{cases}
)DOC"); )DOC");
} }
}; };
...@@ -197,3 +237,12 @@ REGISTER_OPERATOR( ...@@ -197,3 +237,12 @@ REGISTER_OPERATOR(
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>); paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel<float>, REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel<float>,
ops::YoloBoxKernel<double>); ops::YoloBoxKernel<double>);
REGISTER_OP_VERSION(yolo_box)
.AddCheckpoint(
R"ROC(
Upgrade yolo box to add new attribute [iou_aware, iou_aware_factor].
)ROC",
paddle::framework::compatible::OpVersionDesc()
.NewAttr("iou_aware", "Whether use iou aware", false)
.NewAttr("iou_aware_factor", "iou aware factor", 0.5f));
...@@ -28,7 +28,8 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, ...@@ -28,7 +28,8 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes,
const int w, const int an_num, const int class_num, const int w, const int an_num, const int class_num,
const int box_num, int input_size_h, const int box_num, int input_size_h,
int input_size_w, bool clip_bbox, const float scale, int input_size_w, bool clip_bbox, const float scale,
const float bias) { const float bias, bool iou_aware,
const float iou_aware_factor) {
int tid = blockIdx.x * blockDim.x + threadIdx.x; int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x; int stride = blockDim.x * gridDim.x;
T box[4]; T box[4];
...@@ -43,23 +44,29 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, ...@@ -43,23 +44,29 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes,
int img_height = imgsize[2 * i]; int img_height = imgsize[2 * i];
int img_width = imgsize[2 * i + 1]; int img_width = imgsize[2 * i + 1];
int obj_idx = int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); iou_aware);
T conf = sigmoid<T>(input[obj_idx]); T conf = sigmoid<T>(input[obj_idx]);
if (iou_aware) {
int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num);
T iou = sigmoid<T>(input[iou_idx]);
conf = pow(conf, static_cast<T>(1. - iou_aware_factor)) *
pow(iou, static_cast<T>(iou_aware_factor));
}
if (conf < conf_thresh) { if (conf < conf_thresh) {
continue; continue;
} }
int box_idx = int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); iou_aware);
GetYoloBox<T>(box, input, anchors, l, k, j, h, w, input_size_h, GetYoloBox<T>(box, input, anchors, l, k, j, h, w, input_size_h,
input_size_w, box_idx, grid_num, img_height, img_width, scale, input_size_w, box_idx, grid_num, img_height, img_width, scale,
bias); bias);
box_idx = (i * box_num + j * grid_num + k * w + l) * 4; box_idx = (i * box_num + j * grid_num + k * w + l) * 4;
CalcDetectionBox<T>(boxes, box, box_idx, img_height, img_width, clip_bbox); CalcDetectionBox<T>(boxes, box, box_idx, img_height, img_width, clip_bbox);
int label_idx = int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); 5, iou_aware);
int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num;
CalcLabelScore<T>(scores, input, label_idx, score_idx, class_num, conf, CalcLabelScore<T>(scores, input, label_idx, score_idx, class_num, conf,
grid_num); grid_num);
...@@ -80,6 +87,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel<T> { ...@@ -80,6 +87,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
float conf_thresh = ctx.Attr<float>("conf_thresh"); float conf_thresh = ctx.Attr<float>("conf_thresh");
int downsample_ratio = ctx.Attr<int>("downsample_ratio"); int downsample_ratio = ctx.Attr<int>("downsample_ratio");
bool clip_bbox = ctx.Attr<bool>("clip_bbox"); bool clip_bbox = ctx.Attr<bool>("clip_bbox");
bool iou_aware = ctx.Attr<bool>("iou_aware");
float iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
float scale = ctx.Attr<float>("scale_x_y"); float scale = ctx.Attr<float>("scale_x_y");
float bias = -0.5 * (scale - 1.); float bias = -0.5 * (scale - 1.);
...@@ -111,11 +120,18 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel<T> { ...@@ -111,11 +120,18 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
platform::GpuLaunchConfig config = platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num); platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num);
KeYoloBoxFw<T><<<config.block_per_grid, config.thread_per_block, 0, dim3 thread_num = config.thread_per_block;
#ifdef WITH_NV_JETSON
if (config.compute_capability == 53 || config.compute_capability == 62) {
thread_num = 512;
}
#endif
KeYoloBoxFw<T><<<config.block_per_grid, thread_num, 0,
ctx.cuda_device_context().stream()>>>( ctx.cuda_device_context().stream()>>>(
input_data, imgsize_data, boxes_data, scores_data, conf_thresh, input_data, imgsize_data, boxes_data, scores_data, conf_thresh,
anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, anchors_data, n, h, w, an_num, class_num, box_num, input_size_h,
input_size_w, clip_bbox, scale, bias); input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor);
} }
}; };
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <algorithm> #include <algorithm>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/hostdevice.h"
namespace paddle { namespace paddle {
...@@ -43,8 +44,19 @@ HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, ...@@ -43,8 +44,19 @@ HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i,
HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx,
int an_num, int an_stride, int stride, int an_num, int an_stride, int stride,
int entry) { int entry, bool iou_aware) {
return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; if (iou_aware) {
return (batch * an_num + an_idx) * an_stride +
(batch * an_num + an_num + entry) * stride + hw_idx;
} else {
return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
}
}
HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num,
int an_stride, int stride) {
return batch * an_num * an_stride + (batch * an_num + an_idx) * stride +
hw_idx;
} }
template <typename T> template <typename T>
...@@ -92,6 +104,8 @@ class YoloBoxKernel : public framework::OpKernel<T> { ...@@ -92,6 +104,8 @@ class YoloBoxKernel : public framework::OpKernel<T> {
float conf_thresh = ctx.Attr<float>("conf_thresh"); float conf_thresh = ctx.Attr<float>("conf_thresh");
int downsample_ratio = ctx.Attr<int>("downsample_ratio"); int downsample_ratio = ctx.Attr<int>("downsample_ratio");
bool clip_bbox = ctx.Attr<bool>("clip_bbox"); bool clip_bbox = ctx.Attr<bool>("clip_bbox");
bool iou_aware = ctx.Attr<bool>("iou_aware");
float iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
float scale = ctx.Attr<float>("scale_x_y"); float scale = ctx.Attr<float>("scale_x_y");
float bias = -0.5 * (scale - 1.); float bias = -0.5 * (scale - 1.);
...@@ -127,15 +141,22 @@ class YoloBoxKernel : public framework::OpKernel<T> { ...@@ -127,15 +141,22 @@ class YoloBoxKernel : public framework::OpKernel<T> {
for (int j = 0; j < an_num; j++) { for (int j = 0; j < an_num; j++) {
for (int k = 0; k < h; k++) { for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) { for (int l = 0; l < w; l++) {
int obj_idx = int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4); stride, 4, iou_aware);
T conf = sigmoid<T>(input_data[obj_idx]); T conf = sigmoid<T>(input_data[obj_idx]);
if (iou_aware) {
int iou_idx =
GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride);
T iou = sigmoid<T>(input_data[iou_idx]);
conf = pow(conf, static_cast<T>(1. - iou_aware_factor)) *
pow(iou, static_cast<T>(iou_aware_factor));
}
if (conf < conf_thresh) { if (conf < conf_thresh) {
continue; continue;
} }
int box_idx = int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); stride, 0, iou_aware);
GetYoloBox<T>(box, input_data, anchors_data, l, k, j, h, w, GetYoloBox<T>(box, input_data, anchors_data, l, k, j, h, w,
input_size_h, input_size_w, box_idx, stride, input_size_h, input_size_w, box_idx, stride,
img_height, img_width, scale, bias); img_height, img_width, scale, bias);
...@@ -143,8 +164,8 @@ class YoloBoxKernel : public framework::OpKernel<T> { ...@@ -143,8 +164,8 @@ class YoloBoxKernel : public framework::OpKernel<T> {
CalcDetectionBox<T>(boxes_data, box, box_idx, img_height, img_width, CalcDetectionBox<T>(boxes_data, box, box_idx, img_height, img_width,
clip_bbox); clip_bbox);
int label_idx = int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride,
GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); stride, 5, iou_aware);
int score_idx = (i * box_num + j * stride + k * w + l) * class_num; int score_idx = (i * box_num + j * stride + k * w + l) * class_num;
CalcLabelScore<T>(scores_data, input_data, label_idx, score_idx, CalcLabelScore<T>(scores_data, input_data, label_idx, score_idx,
class_num, conf, stride); class_num, conf, stride);
......
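The two index helpers changed above encode the channel layout for iou-aware input: per batch, the an_num IoU maps come first, followed by an_num blocks of (5 + class_num) entry maps. A self-contained sketch with illustrative sizes (the toy config is an assumption; the index formulas are copied verbatim from GetEntryIndex and GetIoUIndex):
#include <cassert>
int main() {
  const int an_num = 3, class_num = 80, stride = 19 * 19;  // assumed sizes
  const int an_stride = (5 + class_num) * stride;
  // Copied from GetEntryIndex (iou_aware branch) and GetIoUIndex above.
  auto entry_idx = [&](int batch, int an_idx, int hw, int entry) {
    return (batch * an_num + an_idx) * an_stride +
           (batch * an_num + an_num + entry) * stride + hw;
  };
  auto iou_idx = [&](int batch, int an_idx, int hw) {
    return batch * an_num * an_stride + (batch * an_num + an_idx) * stride +
           hw;
  };
  // Batch 0: anchor 0's objectness map (entry == 4) starts right after the
  // an_num leading IoU maps plus 4 entry maps.
  assert(entry_idx(0, 0, 0, 4) == (an_num + 4) * stride);
  // Batch 0: anchor 1's IoU map is the second of the leading IoU maps.
  assert(iou_idx(0, 1, 0) == 1 * stride);
  return 0;
}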
...@@ -209,6 +209,73 @@ __global__ void LayerNormForward(const T *x, const U *scale, const U *bias, ...@@ -209,6 +209,73 @@ __global__ void LayerNormForward(const T *x, const U *scale, const U *bias,
} }
} }
template <typename T, typename U, int BlockDim>
__global__ void LayerNormForwardFP16(const T *x, const U *scale, const U *bias,
T *y, U *mean, U *var, float epsilon,
int feature_size) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
using BlockReduce = cub::BlockReduce<PairForLayerNorm<U>, BlockDim>;
__shared__ typename BlockReduce::TempStorage temp_storage;
__shared__ U mean_share;
__shared__ U var_share;
int beg_idx = blockIdx.x * feature_size + threadIdx.x;
int end_idx = (blockIdx.x + 1) * feature_size;
// Step 1: Reduce to calculate mean and var
U mean_val = 0;
U var_val = 0;
for (int i = beg_idx; i < end_idx; i += BlockDim) {
U tmp = static_cast<U>(x[i]);
mean_val += tmp;
var_val += (tmp * tmp);
}
auto pair = BlockReduce(temp_storage)
.Reduce(PairForLayerNorm<U>(mean_val, var_val),
PairForLayerNormAddFunctor<U>());
if (threadIdx.x == 0) {
auto tmp = pair.first_ / static_cast<U>(feature_size);
mean[blockIdx.x] = mean_share = static_cast<U>(tmp);
var[blockIdx.x] = var_share =
static_cast<U>(pair.second_ / static_cast<U>(feature_size) - tmp * tmp);
}
__syncthreads();
mean_val = mean_share;
U invvar = rsqrt_<U>(var_share + static_cast<U>(epsilon));
// Step 2: Calculate y
if (scale != nullptr) {
if (bias != nullptr) {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>(
scale[j] * (static_cast<U>(x[i]) - mean_val) * invvar + bias[j]);
}
} else {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>(scale[j] * (static_cast<U>(x[i]) - mean_val) *
invvar);
}
}
} else { // scale == nullptr
if (bias != nullptr) {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar +
bias[j]);
}
} else {
for (int i = beg_idx, j = threadIdx.x; i < end_idx;
i += BlockDim, j += BlockDim) {
y[i] = static_cast<T>((static_cast<U>(x[i]) - mean_val) * invvar);
}
}
}
#endif
}
template <typename T, typename U, int VPT> template <typename T, typename U, int VPT>
__inline__ __device__ void cuLoadAddStridedInputs( __inline__ __device__ void cuLoadAddStridedInputs(
const int i1_block, const int thr_load_row_off, const int thr_load_col_off, const int i1_block, const int thr_load_row_off, const int thr_load_col_off,
...@@ -872,6 +939,28 @@ void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream, ...@@ -872,6 +939,28 @@ void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
} }
} }
template <>
void LayerNormDirectCUDAFunctor<half>::operator()(
gpuStream_t stream, const half *input, std::vector<int> input_shape,
const half *bias, const half *scale, half *output, half *mean,
half *variance, int begin_norm_axis, float eps) {
const auto x_dims = framework::make_ddim(input_shape);
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int batch_size = static_cast<int>(matrix_dim[0]);
int feature_size = static_cast<int>(matrix_dim[1]);
switch (GetDesiredBlockDim(feature_size)) {
FIXED_BLOCK_DIM_CASE(
LayerNormForwardFP16<half, half,
kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
input, scale, bias, output, mean, variance, eps, feature_size));
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Product from begin_norm_axis to end in layer_norm must be larger "
"than 1"));
break;
}
}
template <typename T> template <typename T>
class LayerNormKernel<platform::CUDADeviceContext, T> class LayerNormKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> { : public framework::OpKernel<T> {
...@@ -961,6 +1050,9 @@ class LayerNormGradKernel<platform::CUDADeviceContext, T> ...@@ -961,6 +1050,9 @@ class LayerNormGradKernel<platform::CUDADeviceContext, T>
}; };
template class LayerNormDirectCUDAFunctor<float>; template class LayerNormDirectCUDAFunctor<float>;
#ifdef TRT_PLUGIN_FP16_AVALIABLE
template class LayerNormDirectCUDAFunctor<half>;
#endif
#undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE #undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE
#undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE #undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE
......
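For reference, the per-row statistics the FP16 forward kernel above computes over the flattened [batch_size, feature_size] matrix; this restates the code, using the single-pass E[x^2] - E[x]^2 form of the variance that the block reduction produces:
\mu = \frac{1}{F}\sum_{i=1}^{F} x_i, \qquad
\sigma^2 = \frac{1}{F}\sum_{i=1}^{F} x_i^2 - \mu^2, \qquad
y_i = \gamma_i \cdot \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta_i
with gamma = scale and beta = bias, each of which may be null and is then dropped from the formula, exactly as in the four branches of step 2.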
...@@ -62,13 +62,22 @@ void RunPyObject(py::object *py_object, ...@@ -62,13 +62,22 @@ void RunPyObject(py::object *py_object,
for (size_t i = 0; i < result_tuple.size(); i++) { for (size_t i = 0; i < result_tuple.size(); i++) {
if ((*outs)[i] != nullptr) { if ((*outs)[i] != nullptr) {
if (Py_None != result_tuple[i].ptr()) { if (Py_None != result_tuple[i].ptr()) {
try { if (py::isinstance<imperative::VarBase>(result_tuple[i])) {
auto result_var = try {
result_tuple[i].cast<std::shared_ptr<imperative::VarBase>>(); auto result_var =
*(*outs)[i] = result_var->Var(); result_tuple[i].cast<std::shared_ptr<imperative::VarBase>>();
} catch (py::cast_error &) { *(*outs)[i] = result_var->Var();
} catch (py::cast_error &) {
PADDLE_THROW(platform::errors::InvalidArgument(
"The `PyLayer.backward` function returns invalid argument, "
"the `%s` type argument can not be cast into `Tensor`.",
result_tuple[i].ptr()->ob_type->tp_name));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"The output of `PyLayer.backward` should be `Tensor`.")); "The output of `PyLayer.backward` should be `Tensor`, but "
"received `%s`.",
result_tuple[i].ptr()->ob_type->tp_name));
} }
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
...@@ -94,13 +103,22 @@ void RunPyObject(py::object *py_object, ...@@ -94,13 +103,22 @@ void RunPyObject(py::object *py_object,
} }
if ((*outs)[0] != nullptr) { if ((*outs)[0] != nullptr) {
if (Py_None != py_result.ptr()) { if (Py_None != py_result.ptr()) {
try { if (py::isinstance<imperative::VarBase>(py_result)) {
auto result_var = try {
py_result.cast<std::shared_ptr<imperative::VarBase>>(); auto result_var =
*((*outs)[0]) = result_var->Var(); py_result.cast<std::shared_ptr<imperative::VarBase>>();
} catch (py::cast_error &) { *((*outs)[0]) = result_var->Var();
} catch (py::cast_error &) {
PADDLE_THROW(platform::errors::InvalidArgument(
"The `PyLayer.backward` function returns invalid argument, the "
"`%s` type argument can not be cast into `Tensor`.",
py_result.ptr()->ob_type->tp_name));
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"The output of `PyLayer.backward` should be `Tensor`.")); "The output of `PyLayer.backward` should be `Tensor`, but "
"received `%s`",
py_result.ptr()->ob_type->tp_name));
} }
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
......
...@@ -68,7 +68,6 @@ BufferedReader::BufferedReader( ...@@ -68,7 +68,6 @@ BufferedReader::BufferedReader(
stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx);
} }
#endif #endif
is_same_place_ = false;
cpu_buffer_.resize(buffer_size); cpu_buffer_.resize(buffer_size);
cuda_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size);
npu_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size);
...@@ -116,7 +115,7 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -116,7 +115,7 @@ void BufferedReader::ReadAsync(size_t i) {
std::vector<void *> cuda_pinned_ptrs; std::vector<void *> cuda_pinned_ptrs;
cuda_pinned_ptrs.reserve(cpu.size()); cuda_pinned_ptrs.resize(cpu.size());
platform::RecordEvent record_event("BufferedReader:MemoryCopy"); platform::RecordEvent record_event("BufferedReader:MemoryCopy");
// NODE(chenwehiang): When we use CUDAPinned Memory, we need call // NOTE(chenweihang): When we use CUDAPinned Memory, we need to call
// cudaHostAlloc, that is a CUDA API, calling CUDA API need load // cudaHostAlloc, that is a CUDA API, calling CUDA API need load
// cuda lib into device, it will cost hundreds of MB of GPU memory. // cuda lib into device, it will cost hundreds of MB of GPU memory.
// If we don't set Device here, which will use CUDAPlace(0) default. // If we don't set Device here, which will use CUDAPlace(0) default.
...@@ -126,18 +125,21 @@ void BufferedReader::ReadAsync(size_t i) { ...@@ -126,18 +125,21 @@ void BufferedReader::ReadAsync(size_t i) {
if (platform::is_cpu_place(cpu[i].place())) { if (platform::is_cpu_place(cpu[i].place())) {
cuda[i].Resize(cpu[i].dims()); cuda[i].Resize(cpu[i].dims());
cuda[i].set_layout(cpu[i].layout()); cuda[i].set_layout(cpu[i].layout());
cuda_pinned_ptrs.emplace_back( cuda_pinned_ptrs[i] =
cuda[i].mutable_data(cuda_pinned_place, cpu[i].type())); cuda[i].mutable_data(cuda_pinned_place, cpu[i].type());
auto size = auto size =
cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type());
memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i],
BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()), BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()),
cpu[i].data<void>(), size); cpu[i].data<void>(), size);
cuda[i].set_lod(cpu[i].lod()); cuda[i].set_lod(cpu[i].lod());
} else { } else {
// we set same place flag & use cpu[i] directly // Here cpu[i]'s place may be CUDAPlace, CUDAPinnedPlace, or
is_same_place_ = true; // another place; we don't copy its memory to CUDAPinnedPlace,
// but share the tensor data with cuda[i] instead
cuda[i].ShareDataWith(cpu[i]);
} }
} }
} else { } else {
...@@ -296,9 +298,9 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) { ...@@ -296,9 +298,9 @@ void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
return; return;
} }
if (platform::is_gpu_place(place_) && !is_same_place_) { if (platform::is_gpu_place(place_)) {
*out = std::move(cuda_buffer_[i]); *out = std::move(cuda_buffer_[i]);
} else if (platform::is_npu_place(place_) && !is_same_place_) { } else if (platform::is_npu_place(place_)) {
*out = std::move(npu_buffer_[i]); *out = std::move(npu_buffer_[i]);
} else { } else {
*out = std::move(cpu_buffer_[i]); *out = std::move(cpu_buffer_[i]);
......
...@@ -67,7 +67,6 @@ class BufferedReader : public framework::DecoratedReader { ...@@ -67,7 +67,6 @@ class BufferedReader : public framework::DecoratedReader {
// buffer, just read async and create futures as buffer size. However, to // buffer, just read async and create futures as buffer size. However, to
// malloc tensors every time is extremely slow. Here we store all data in // malloc tensors every time is extremely slow. Here we store all data in
// buffers and prevent alloc every time. // buffers and prevent alloc every time.
bool is_same_place_;
std::vector<TensorVec> cpu_buffer_; std::vector<TensorVec> cpu_buffer_;
std::vector<TensorVec> cuda_buffer_; std::vector<TensorVec> cuda_buffer_;
std::vector<TensorVec> npu_buffer_; std::vector<TensorVec> npu_buffer_;
......
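The diff above removes the is_same_place_ bookkeeping: plain CPU inputs are copied into pinned memory, while inputs already on a device place are aliased into cuda_buffer_ via ShareDataWith, so ReadNextImpl can always hand out cuda_buffer_[i] for GPU places. A minimal C++ sketch of that copy-versus-share decision (Place and Buffer are toy stand-ins for the Paddle types, not the real API):

#include <cassert>
#include <memory>
#include <vector>

enum class Place { kCPU, kCUDAPinned, kCUDA };  // toy stand-in for platform::Place

struct Buffer {
  Place place;
  std::shared_ptr<std::vector<float>> data;
};

// Copy plain CPU data into pinned memory; otherwise alias the storage,
// mirroring cuda[i].ShareDataWith(cpu[i]) -- no is_same_place_ flag needed.
Buffer ToPinned(const Buffer& src) {
  Buffer dst;
  if (src.place == Place::kCPU) {
    dst.place = Place::kCUDAPinned;
    dst.data = std::make_shared<std::vector<float>>(*src.data);  // deep copy
  } else {
    dst = src;  // share, do not copy
  }
  return dst;
}

int main() {
  Buffer cpu{Place::kCPU, std::make_shared<std::vector<float>>(4, 1.f)};
  assert(ToPinned(cpu).data != cpu.data);  // copied
  Buffer gpu{Place::kCUDA, cpu.data};
  assert(ToPinned(gpu).data == gpu.data);  // shared
}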
...@@ -124,8 +124,10 @@ __global__ void GPUROIAlignForward( ...@@ -124,8 +124,10 @@ __global__ void GPUROIAlignForward(
T roi_width = roi_xmax - roi_xmin; T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin; T roi_height = roi_ymax - roi_ymin;
roi_width = max(roi_width, static_cast<T>(1.)); if (!continuous_coordinate) {
roi_height = max(roi_height, static_cast<T>(1.)); roi_width = max(roi_width, static_cast<T>(1.));
roi_height = max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
...@@ -138,7 +140,7 @@ __global__ void GPUROIAlignForward( ...@@ -138,7 +140,7 @@ __global__ void GPUROIAlignForward(
: ceil(roi_height / pooled_height); : ceil(roi_height / pooled_height);
int roi_bin_grid_w = int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
const T count = roi_bin_grid_h * roi_bin_grid_w; const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1);
T output_val = 0; T output_val = 0;
for (int iy = 0; iy < roi_bin_grid_h; iy++) { for (int iy = 0; iy < roi_bin_grid_h; iy++) {
const T y = roi_ymin + ph * bin_size_h + const T y = roi_ymin + ph * bin_size_h +
...@@ -180,9 +182,10 @@ __global__ void GPUROIAlignBackward( ...@@ -180,9 +182,10 @@ __global__ void GPUROIAlignBackward(
T roi_width = roi_xmax - roi_xmin; T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin; T roi_height = roi_ymax - roi_ymin;
roi_width = max(roi_width, static_cast<T>(1.)); if (!continuous_coordinate) {
roi_height = max(roi_height, static_cast<T>(1.)); roi_width = max(roi_width, static_cast<T>(1.));
roi_height = max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
......
...@@ -226,8 +226,10 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> { ...@@ -226,8 +226,10 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
T roi_width = roi_xmax - roi_xmin; T roi_width = roi_xmax - roi_xmin;
T roi_height = roi_ymax - roi_ymin; T roi_height = roi_ymax - roi_ymin;
roi_width = std::max(roi_width, static_cast<T>(1.)); if (!aligned) {
roi_height = std::max(roi_height, static_cast<T>(1.)); roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
...@@ -239,7 +241,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> { ...@@ -239,7 +241,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel<T> {
int roi_bin_grid_w = (sampling_ratio > 0) int roi_bin_grid_w = (sampling_ratio > 0)
? sampling_ratio ? sampling_ratio
: ceil(roi_width / pooled_width); : ceil(roi_width / pooled_width);
const T count = roi_bin_grid_h * roi_bin_grid_w; const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);
Tensor pre_pos; Tensor pre_pos;
Tensor pre_w; Tensor pre_w;
int pre_size = count * out_stride[1]; int pre_size = count * out_stride[1];
...@@ -362,6 +364,10 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> { ...@@ -362,6 +364,10 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel<T> {
T roi_height = roi_ymax - roi_ymin; T roi_height = roi_ymax - roi_ymin;
roi_width = std::max(roi_width, static_cast<T>(1.)); roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.)); roi_height = std::max(roi_height, static_cast<T>(1.));
if (!aligned) {
roi_width = std::max(roi_width, static_cast<T>(1.));
roi_height = std::max(roi_height, static_cast<T>(1.));
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
......
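With the new aligned/continuous_coordinate convention, ROI width and height are no longer clamped to 1, so a degenerate ROI (xmax == xmin) can yield a 0-by-0 sampling grid; clamping count to at least 1 keeps the later averaging output_val / count well defined. A standalone sketch of the arithmetic (the ROI values are assumed inputs, not taken from the kernels above):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
  // A degenerate ROI (xmax == xmin) is legal once aligned=true skips the
  // clamp-to-1 of roi_width/roi_height.
  const float roi_width = 0.f, roi_height = 0.f;
  const int pooled_height = 2, pooled_width = 2, sampling_ratio = 0;
  const int roi_bin_grid_h =
      (sampling_ratio > 0)
          ? sampling_ratio
          : static_cast<int>(std::ceil(roi_height / pooled_height));  // 0
  const int roi_bin_grid_w =
      (sampling_ratio > 0)
          ? sampling_ratio
          : static_cast<int>(std::ceil(roi_width / pooled_width));  // 0
  // Without the max(), count would be 0 and output_val / count would
  // divide by zero.
  const int count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1);
  std::printf("grid = %d x %d, count = %d\n", roi_bin_grid_h, roi_bin_grid_w,
              count);
}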
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/assign_value_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/slice_utils.h"
#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -59,106 +60,6 @@ inline std::string GetValueName(framework::proto::VarType::Type data_type) { ...@@ -59,106 +60,6 @@ inline std::string GetValueName(framework::proto::VarType::Type data_type) {
return value_name; return value_name;
} }
inline void CheckAndUpdateSlice(const framework::DDim in_dims,
const std::vector<int64_t> axes,
std::vector<int64_t>* starts,
std::vector<int64_t>* ends,
std::vector<int64_t>* steps) {
for (size_t i = 0; i < axes.size(); ++i) {
int64_t axis = axes[i];
int64_t dim_value = in_dims[axis];
int64_t start =
(*starts)[i] < 0 ? ((*starts)[i] + dim_value) : (*starts)[i];
int64_t end = (*ends)[i] < 0 ? ((*ends)[i] + dim_value) : (*ends)[i];
start = std::max(start, static_cast<int64_t>(0));
end = std::min(end, dim_value);
int64_t step = (*steps)[i];
PADDLE_ENFORCE_NE(
step, 0, platform::errors::InvalidArgument(
"Step should not be 0, but received step = %d.", step));
if (step > 0) {
start = std::min(start, dim_value);
end = std::max(end, static_cast<int64_t>(0));
PADDLE_ENFORCE_GT(
end, start,
platform::errors::InvalidArgument(
"When step > 0, end should be greater than start, but "
"received end = %d, start = %d.",
end, start));
} else {
// NOTE(liym27): When step < 0, start should less and equal to dim_value-1
// "end is -1" means contain the 0-th element of this axis.
start = std::min(start, dim_value - 1);
end = std::max(end, static_cast<int64_t>(-1));
PADDLE_ENFORCE_GT(
start, end,
platform::errors::InvalidArgument(
"When step < 0, start should be greater than end, but "
"received start = %d, end = %d.",
start, end));
}
(*starts)[i] = start;
(*ends)[i] = end;
}
}
inline framework::DDim GetSliceDims(const framework::DDim in_dims,
const std::vector<int64_t>& axes,
const std::vector<int64_t>& starts,
const std::vector<int64_t>& ends,
const std::vector<int64_t>& steps) {
framework::DDim slice_dims(in_dims);
for (size_t i = 0; i < axes.size(); ++i) {
int64_t axis = axes[i];
int64_t start = starts[i];
int64_t end = ends[i];
int64_t step = steps[i];
if (step > 0) {
slice_dims[axis] = (end - start + step - 1) / step;
} else {
slice_dims[axis] = (end - start + step + 1) / step;
}
}
return slice_dims;
}
inline framework::DDim GetDecreasedDims(
const framework::DDim slice_dims,
const std::vector<int64_t>& decrease_axes) {
// Get dims after decreasing axes.
framework::DDim decreased_dims(slice_dims);
if (decrease_axes.size() > 0) {
for (size_t i = 0; i < decrease_axes.size(); ++i) {
int64_t axis = decrease_axes[i];
PADDLE_ENFORCE_EQ(
decreased_dims[axis], 1,
platform::errors::InvalidArgument("decrease dim should be 1"));
decreased_dims[axis] = 0;
}
std::vector<int64_t> new_shape;
for (int i = 0; i < decreased_dims.size(); ++i) {
if (decreased_dims[i] != 0) {
new_shape.push_back(decreased_dims[i]);
}
}
// NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and
// uses [1] instead.
if (new_shape.size() == 0) {
new_shape.push_back(1);
}
decreased_dims = framework::make_ddim(new_shape);
}
return decreased_dims;
}
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class SetValueKernel : public framework::OpKernel<T> { class SetValueKernel : public framework::OpKernel<T> {
public: public:
...@@ -225,8 +126,8 @@ class SetValueKernel : public framework::OpKernel<T> { ...@@ -225,8 +126,8 @@ class SetValueKernel : public framework::OpKernel<T> {
} }
auto in_dims = in->dims(); auto in_dims = in->dims();
CheckAndUpdateSlice(in_dims, axes, &starts, &ends, &steps); CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps);
auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, steps); auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps);
auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes);
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
......
...@@ -28,13 +28,10 @@ class SliceOp : public framework::OperatorWithKernel { ...@@ -28,13 +28,10 @@ class SliceOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "slice");
platform::errors::InvalidArgument( OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "slice");
"Input (Input) of slice op should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, // Case 1: Special treatment when input is a tensor array.
platform::errors::InvalidArgument(
"Output (Out) of slice op should not be null."));
auto x_var_type = ctx->GetInputsVarType("Input")[0]; auto x_var_type = ctx->GetInputsVarType("Input")[0];
auto axes = ctx->Attrs().Get<std::vector<int>>("axes"); auto axes = ctx->Attrs().Get<std::vector<int>>("axes");
if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) {
...@@ -57,6 +54,8 @@ class SliceOp : public framework::OperatorWithKernel { ...@@ -57,6 +54,8 @@ class SliceOp : public framework::OperatorWithKernel {
return; return;
} }
} }
// Case 2: input is a tensor.
auto in_dims = ctx->GetInputDim("Input"); auto in_dims = ctx->GetInputDim("Input");
PADDLE_ENFORCE_LT(in_dims.size(), 7, PADDLE_ENFORCE_LT(in_dims.size(), 7,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
...@@ -65,101 +64,54 @@ class SliceOp : public framework::OperatorWithKernel { ...@@ -65,101 +64,54 @@ class SliceOp : public framework::OperatorWithKernel {
auto starts = ctx->Attrs().Get<std::vector<int>>("starts"); auto starts = ctx->Attrs().Get<std::vector<int>>("starts");
auto ends = ctx->Attrs().Get<std::vector<int>>("ends"); auto ends = ctx->Attrs().Get<std::vector<int>>("ends");
auto infer_flags = ctx->Attrs().Get<std::vector<int>>("infer_flags");
auto decrease_axis = ctx->Attrs().Get<std::vector<int>>("decrease_axis"); auto decrease_axis = ctx->Attrs().Get<std::vector<int>>("decrease_axis");
auto infer_flags = ctx->Attrs().Get<std::vector<int>>("infer_flags");
auto starts_size = starts.size();
auto ends_size = ends.size();
if (infer_flags.empty()) { if (infer_flags.empty()) {
// Initialize infer_flags with 1. // Initialize infer_flags with 1.
// To be compatible with other op tests in which infer_flags is not set. // To be compatible with other op tests in which infer_flags is not set.
infer_flags = std::vector<int>(axes.size(), 1); infer_flags = std::vector<int>(axes.size(), 1);
} }
// 2.1 Check attrs.
auto starts_size = starts.size();
auto ends_size = ends.size();
if (ctx->HasInputs("StartsTensorList")) { if (ctx->HasInputs("StartsTensorList")) {
auto StartsTensorList = ctx->Inputs("StartsTensorList"); starts_size = ctx->Inputs("StartsTensorList").size();
PADDLE_ENFORCE_GT(StartsTensorList.size(), 0, PADDLE_ENFORCE_GT(starts_size, 0,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"StartsTensorList size can't be zero")); "StartsTensorList size can't be zero"));
starts_size = StartsTensorList.size();
} }
if (ctx->HasInputs("EndsTensorList")) { if (ctx->HasInputs("EndsTensorList")) {
auto EndsTensorList = ctx->Inputs("EndsTensorList"); ends_size = ctx->Inputs("EndsTensorList").size();
PADDLE_ENFORCE_GT(EndsTensorList.size(), 0, PADDLE_ENFORCE_GT(ends_size, 0, platform::errors::InvalidArgument(
platform::errors::InvalidArgument( "EndsTensorList size can't be zero"));
"EndsTensorList size can't be zero"));
ends_size = EndsTensorList.size();
} }
if (ctx->HasInput("StartsTensor") == false) { if (!ctx->HasInput("StartsTensor")) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
starts_size, axes.size(), starts_size, axes.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The size of starts must be equal to the size of axes.")); "The size of starts must be equal to the size of axes."));
} }
if (ctx->HasInput("EndsTensor") == false) { if (!ctx->HasInput("EndsTensor")) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ends_size, axes.size(), ends_size, axes.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The size of ends must be equal to the size of axes.")); "The size of ends must be equal to the size of axes."));
} }
int dim_value, start, end; CheckAndUpdateSliceAttrs<int>(in_dims, axes, &starts, &ends, nullptr,
for (size_t i = 0; i < axes.size(); ++i) { &infer_flags);
PADDLE_ENFORCE_LT(static_cast<int>(axes[i]), in_dims.size(),
platform::errors::InvalidArgument(
"The index of dimension in axes must be less "
"than the size of input shape."));
if (infer_flags[i] == -1) {
out_dims[axes[i]] = -1;
} else {
// infer out_dim shape
dim_value = out_dims[axes[i]];
if (dim_value > 0) {
start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
start = std::max(start, 0);
end = std::max(end, 0);
end = std::min(end, dim_value);
PADDLE_ENFORCE_LE(start, dim_value,
platform::errors::InvalidArgument(
"start should be less than or equal to the "
"dimension value, but received "
"start = %d, shape[%d] = %d.",
starts[i], axes[i], out_dims[axes[i]]));
PADDLE_ENFORCE_GT(end, start,
platform::errors::InvalidArgument(
"end should greater than start, but received "
"end = %d, start = %d.",
ends[i], starts[i]));
out_dims[axes[i]] = end - start;
}
}
}
// generate new shape
if (decrease_axis.size() > 0) {
std::vector<int> new_out_shape;
for (size_t i = 0; i < decrease_axis.size(); ++i) {
if (ctx->IsRuntime() && infer_flags[i] != -1) {
PADDLE_ENFORCE_EQ(
out_dims[decrease_axis[i]], 1,
platform::errors::InvalidArgument("decrease dim should be 1"));
}
out_dims[decrease_axis[i]] = 0;
}
for (int i = 0; i < out_dims.size(); ++i) { auto slice_dims =
if (out_dims[i] != 0) { GetSliceDims<int>(in_dims, axes, starts, ends, nullptr, &infer_flags);
new_out_shape.push_back(out_dims[i]); if (ctx->IsRuntime()) {
} out_dims = GetDecreasedDims<int>(slice_dims, decrease_axis, &infer_flags);
} } else {
if (new_out_shape.size() == 0) { out_dims = GetDecreasedDims<int>(slice_dims, decrease_axis, nullptr);
new_out_shape.push_back(1);
}
out_dims = framework::make_ddim(new_out_shape);
} }
ctx->SetOutputDim("Out", out_dims); ctx->SetOutputDim("Out", out_dims);
if (axes[0] != 0) { if (axes[0] != 0) {
ctx->ShareLoD("Input", /*->*/ "Out"); ctx->ShareLoD("Input", /*->*/ "Out");
...@@ -185,6 +137,7 @@ class SliceOp : public framework::OperatorWithKernel { ...@@ -185,6 +137,7 @@ class SliceOp : public framework::OperatorWithKernel {
return framework::OpKernelType( return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace()); OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace());
} }
framework::OpKernelType GetKernelTypeForVar( framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const Tensor &tensor, const std::string &var_name, const Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const override { const framework::OpKernelType &expected_kernel_type) const override {
......
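The rewritten InferShape defers the per-axis bookkeeping to the slice helpers and leans on the infer_flags convention: a flag of -1 marks an axis whose start/end is only known at runtime (e.g. fed through StartsTensor), so the output dim is left as -1 rather than computed. A small sketch of just that convention (InferSliceDims and its precomputed lens are illustrative simplifications, not the Paddle helpers):

#include <cassert>
#include <vector>

// -1 in infer_flags means "unknown until runtime": leave the dim as -1.
std::vector<long long> InferSliceDims(std::vector<long long> dims,
                                      const std::vector<int>& axes,
                                      const std::vector<long long>& lens,
                                      const std::vector<int>& infer_flags) {
  for (size_t i = 0; i < axes.size(); ++i)
    dims[axes[i]] = infer_flags[i] == -1 ? -1 : lens[i];
  return dims;
}

int main() {
  auto d = InferSliceDims({8, 16}, {0, 1}, {4, 5}, {1, -1});
  assert(d[0] == 4 && d[1] == -1);
}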
(This diff is collapsed and not shown.)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <paddle/fluid/framework/operator.h>
#include <string>
#include <vector>
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T = int64_t>
inline void CheckAndUpdateSliceAttrs(const framework::DDim in_dims,
const std::vector<T>& axes,
std::vector<T>* starts,
std::vector<T>* ends,
std::vector<int64_t>* steps = nullptr,
std::vector<T>* infer_flags = nullptr) {
for (size_t i = 0; i < axes.size(); ++i) {
T axis = axes[i];
T dim_value = in_dims[axis];
if (dim_value > 0) {
if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
continue;
}
T start = (*starts)[i] < 0 ? ((*starts)[i] + dim_value) : (*starts)[i];
start = std::max(start, static_cast<T>(0));
T end = (*ends)[i] < 0 ? ((*ends)[i] + dim_value) : (*ends)[i];
end = std::min(end, dim_value);
T step = steps == nullptr ? 1 : (*steps)[i];
PADDLE_ENFORCE_NE(
step, 0, platform::errors::InvalidArgument(
"Step should not be 0, but received step = %d.", step));
if (step > 0) {
start = std::min(start, dim_value);
end = std::max(end, static_cast<T>(0));
PADDLE_ENFORCE_GT(
end, start,
platform::errors::InvalidArgument(
"When step > 0, end should be greater than start, but "
"received end = %d, start = %d.",
end, start));
} else {
// NOTE(liym27): When step < 0, start should be less than or equal to
// dim_value - 1.
// "end is -1" means the 0-th element of this axis is included.
start = std::min(start, dim_value - 1);
end = std::max(end, static_cast<T>(-1));
PADDLE_ENFORCE_GT(
start, end,
platform::errors::InvalidArgument(
"When step < 0, start should be greater than end, but "
"received start = %d, end = %d.",
start, end));
}
(*starts)[i] = start;
(*ends)[i] = end;
}
}
}
template <typename T = int64_t>
inline framework::DDim GetSliceDims(const framework::DDim in_dims,
const std::vector<T>& axes,
const std::vector<T>& starts,
const std::vector<T>& ends,
std::vector<T>* steps = nullptr,
std::vector<T>* infer_flags = nullptr) {
framework::DDim slice_dims(in_dims);
for (size_t i = 0; i < axes.size(); ++i) {
T axis = axes[i];
if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
slice_dims[axis] = -1;
continue;
}
T start = starts[i];
T end = ends[i];
T step = steps == nullptr ? 1 : (*steps)[i];
if (step > 0) {
slice_dims[axis] = (end - start + step - 1) / step;
} else {
slice_dims[axis] = (end - start + step + 1) / step;
}
}
return slice_dims;
}
template <typename T = int64_t>
inline framework::DDim GetDecreasedDims(const framework::DDim slice_dims,
const std::vector<T>& decrease_axes,
std::vector<T>* infer_flags = nullptr) {
framework::DDim decreased_dims(slice_dims);
if (decrease_axes.size() > 0) {
for (size_t i = 0; i < decrease_axes.size(); ++i) {
T axis = decrease_axes[i];
if (infer_flags && (*infer_flags)[i] != -1) {
PADDLE_ENFORCE_EQ(
decreased_dims[axis], 1,
platform::errors::InvalidArgument("decrease dim should be 1"));
}
decreased_dims[axis] = 0;
}
std::vector<T> new_shape;
for (int i = 0; i < decreased_dims.size(); ++i) {
if (decreased_dims[i] != 0) {
new_shape.push_back(decreased_dims[i]);
}
}
// NOTE(liym27): Paddle does not support tensors of rank 0, so
// [1] is used instead.
if (new_shape.size() == 0) {
new_shape.push_back(1);
}
decreased_dims = framework::make_ddim(new_shape);
}
return decreased_dims;
}
} // namespace operators
} // namespace paddle
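The length formulas in GetSliceDims are the usual ceiling divisions toward the step's direction; a standalone restatement of the two sign cases, assuming starts/ends have already been normalized by CheckAndUpdateSliceAttrs:

#include <cassert>

// One-axis restatement of GetSliceDims' arithmetic.
long long SliceLen(long long start, long long end, long long step) {
  return step > 0 ? (end - start + step - 1) / step
                  : (end - start + step + 1) / step;
}

int main() {
  // x[1:7:2] on an axis of size 10 -> elements 1, 3, 5 -> length 3
  assert(SliceLen(1, 7, 2) == 3);
  // x[8:-1:-3] after normalization ("end == -1" keeps element 0 reachable)
  // -> elements 8, 5, 2 -> length 3
  assert(SliceLen(8, -1, -3) == 3);
}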
...@@ -324,6 +324,7 @@ REGISTER_OPERATOR(strided_slice_grad, ops::StridedSliceOpGrad, ...@@ -324,6 +324,7 @@ REGISTER_OPERATOR(strided_slice_grad, ops::StridedSliceOpGrad,
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
strided_slice, strided_slice,
ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, bool>,
ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, int>, ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, int>,
ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, int64_t>, ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, float>, ops::StridedSliceKernel<paddle::platform::CPUDeviceContext, float>,
...@@ -335,6 +336,7 @@ REGISTER_OP_CPU_KERNEL( ...@@ -335,6 +336,7 @@ REGISTER_OP_CPU_KERNEL(
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
strided_slice_grad, strided_slice_grad,
ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int>, ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int64_t>, ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, float>, ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
strided_slice, strided_slice,
ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, bool>,
ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, int>, ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, int>,
ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, int64_t>, ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, float>, ops::StridedSliceKernel<paddle::platform::CUDADeviceContext, float>,
...@@ -29,7 +30,8 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -29,7 +30,8 @@ REGISTER_OP_CUDA_KERNEL(
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
strided_slice_grad, strided_slice_grad,
ops::StridedSliceGradKernel<paddle::platform::CPUDeviceContext, int>, ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>, ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, float>, ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, double>, ops::StridedSliceGradKernel<paddle::platform::CUDADeviceContext, double>,
......
...@@ -105,13 +105,15 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ...@@ -105,13 +105,15 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker,
ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>); ops::TrilTriuGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>, tril_triu, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>, ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, plat::float16>); ops::TrilTriuOpKernel<paddle::platform::CPUDeviceContext, plat::float16>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
tril_triu_grad, tril_triu_grad,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, bool>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>, ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>, ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>, ops::TrilTriuGradOpKernel<paddle::platform::CPUDeviceContext, int>,
......
...@@ -18,7 +18,7 @@ namespace ops = paddle::operators; ...@@ -18,7 +18,7 @@ namespace ops = paddle::operators;
namespace plat = paddle::platform; namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
tril_triu, tril_triu, ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, bool>,
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, float>, ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, double>, ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int>, ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, int>,
...@@ -26,6 +26,7 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -26,6 +26,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, plat::float16>); ops::TrilTriuOpKernel<paddle::platform::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
tril_triu_grad, tril_triu_grad,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, bool>,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, float>, ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, double>, ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int>, ops::TrilTriuGradOpKernel<paddle::platform::CUDADeviceContext, int>,
......
...@@ -563,7 +563,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } ...@@ -563,7 +563,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; }
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
: CPUDeviceContext(place), p_blobmap_() { : CPUDeviceContext(place), p_blobmap_() {
p_blobmap_.reset(new BlobMap()); p_blobmap_.reset(new BlobMap());
p_exec_items_.reset(new ExecMap()); p_exec_items_.reset(new ExecShape());
p_mutex_.reset(new std::mutex()); p_mutex_.reset(new std::mutex());
} }
...@@ -644,10 +644,15 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { ...@@ -644,10 +644,15 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
if (ptr == nullptr) { if (ptr == nullptr) {
p_blobmap_->clear(); p_blobmap_->clear();
} else { } else {
for (auto& v : (*p_exec_items_)[ptr]) { // Iterate through all shapes and, for
(v.first)->erase(v.second); // each shape and active executor, release
// all entries of this executor
for (auto& s : *p_exec_items_) {
for (auto& v : (*s.second)[ptr]) {
(v.first)->erase(v.second);
}
s.second->erase(ptr);
} }
p_exec_items_->erase(ptr);
} }
} else { } else {
VLOG(3) << "Prevented Clearing DNNL cache."; VLOG(3) << "Prevented Clearing DNNL cache.";
...@@ -655,11 +660,24 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { ...@@ -655,11 +660,24 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) {
} }
} }
void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const {
p_exec_items_->erase(p_exec_items_->begin());
}
void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob, void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob,
KeyBlob::iterator it) const { KeyBlob::iterator it) const {
// Take current input shape from TLS
// Take current executor address from TLS // Take current executor address from TLS
// and for this executor's items add the one defined with arguments // and for this executor's items add the one defined with arguments
(*p_exec_items_)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it)); auto key_it = p_exec_items_
->insert(std::make_pair(tls().cur_input_shape_str,
std::make_shared<ExecMap>()))
.first;
(*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it));
VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size()
<< " curr exec size: "
<< (*key_it->second)[tls().get_curr_exec()].size() << "\n";
} }
void MKLDNNDeviceContext::BlockNextCacheClearing() { void MKLDNNDeviceContext::BlockNextCacheClearing() {
...@@ -716,6 +734,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, ...@@ -716,6 +734,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
VLOG(2) << "sid=" << sid VLOG(2) << "sid=" << sid
<< ", remove all blobs of shape: " << sBlob->begin()->first; << ", remove all blobs of shape: " << sBlob->begin()->first;
sBlob->erase(sBlob->begin()->first); sBlob->erase(sBlob->begin()->first);
RemoveShapeEntriesWithExecutor();
} }
pBlob = std::make_shared<KeyBlob>(); pBlob = std::make_shared<KeyBlob>();
(*sBlob)[tls().cur_input_shape_str] = pBlob; (*sBlob)[tls().cur_input_shape_str] = pBlob;
...@@ -739,7 +758,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, ...@@ -739,7 +758,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
return; return;
} }
unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) { unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
unsigned int num_entries = 0; unsigned int num_entries = 0;
for (auto const& l3 : *p_blobmap_) { for (auto const& l3 : *p_blobmap_) {
for (auto const& l2 : *(l3.second)) { for (auto const& l2 : *(l3.second)) {
......
...@@ -749,8 +749,14 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -749,8 +749,14 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
using ShapeBlob = umap_key_string_t<KeyBlob>; using ShapeBlob = umap_key_string_t<KeyBlob>;
using BlobMap = umap_value_smart_t<int, ShapeBlob>; using BlobMap = umap_value_smart_t<int, ShapeBlob>;
using ExecMap = std::unordered_map< // Auxiliary two-level structure (shape, executor) to more easily control
void*, std::vector<std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>>>; // clearing of cache objects related to a specific executor
using ExecKey = void*;
using ExecMapCacheIterPair = std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>;
using ExecMap =
std::unordered_map<ExecKey, std::vector<ExecMapCacheIterPair>>;
using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>;
explicit MKLDNNDeviceContext(CPUPlace place); explicit MKLDNNDeviceContext(CPUPlace place);
...@@ -759,6 +765,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -759,6 +765,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
// Register object to currently used executor's map // Register object to currently used executor's map
void LinkEntryWithExecutor(BlobPtr_t<KeyBlob>, KeyBlob::iterator) const; void LinkEntryWithExecutor(BlobPtr_t<KeyBlob>, KeyBlob::iterator) const;
void RemoveShapeEntriesWithExecutor(void) const;
// Remove all entries from the blob map // Remove all entries from the blob map
void ResetBlobMap(void* ptr); void ResetBlobMap(void* ptr);
...@@ -773,7 +780,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -773,7 +780,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
void SetBlob(const std::string& name, std::shared_ptr<void> data) const; void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
// Calculate number of oneDNN objects cached // Calculate number of oneDNN objects cached
unsigned int GetCachedObjectsNumber(void); unsigned int GetCachedObjectsNumber(void) const;
// Find a saved blob. Return nullptr if not found // Find a saved blob. Return nullptr if not found
std::shared_ptr<void> GetBlob(const std::string& name) const; std::shared_ptr<void> GetBlob(const std::string& name) const;
...@@ -786,7 +793,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -786,7 +793,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
std::shared_ptr<BlobMap> p_blobmap_; std::shared_ptr<BlobMap> p_blobmap_;
// Map key is pointer of executor and value is a data(iterator in map) needed // Map key is pointer of executor and value is a data(iterator in map) needed
// to erase // to erase
std::shared_ptr<ExecMap> p_exec_items_; std::shared_ptr<ExecShape> p_exec_items_;
std::shared_ptr<std::mutex> p_mutex_; std::shared_ptr<std::mutex> p_mutex_;
bool block_next_cache_clearing_ = false; bool block_next_cache_clearing_ = false;
}; };
......
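The new ExecShape layout nests one map inside another: input-shape string on the outside, executor address on the inside, so ResetBlobMap can walk every shape and drop just one executor's entries, while RemoveShapeEntriesWithExecutor drops a whole shape. A toy mirror of that structure (the cached values are plain ints here instead of (blob, iterator) pairs):

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

using ExecKey = const void*;
using ExecMap = std::unordered_map<ExecKey, std::vector<int>>;
using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>;

int main() {
  ExecShape items;
  int exec_a = 0, exec_b = 0;  // stand-ins for executor addresses
  (*items.insert({"2x3", std::make_shared<ExecMap>()}).first->second)[&exec_a]
      .push_back(1);
  (*items.insert({"4x4", std::make_shared<ExecMap>()}).first->second)[&exec_a]
      .push_back(2);
  (*items["4x4"])[&exec_b].push_back(3);

  // ResetBlobMap(exec_a): walk every shape and drop exec_a's entries only.
  for (auto& s : items) s.second->erase(&exec_a);
  std::cout << items["2x3"]->size() << " " << items["4x4"]->size() << "\n";  // 0 1
}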
...@@ -26,9 +26,11 @@ size_t Alignment(size_t size, const platform::Place &place) { ...@@ -26,9 +26,11 @@ size_t Alignment(size_t size, const platform::Place &place) {
#elif defined(PADDLE_WITH_XPU) #elif defined(PADDLE_WITH_XPU)
// TODO(wangxi): add XpuMinChunkSize // TODO(wangxi): add XpuMinChunkSize
alignment = alignment; alignment = alignment;
#elif defined(PADDLE_WITH_ASCEND_CL)
alignment = NPUMinChunkSize();
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA.")); "Fluid is not compiled with CUDA or NPU."));
#endif #endif
} }
size_t remaining = size % alignment; size_t remaining = size % alignment;
......
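The tail of Alignment() rounds the requested size up to a multiple of the platform's minimum chunk size. A two-line restatement of that round-up, assuming the lines elided after `size % alignment` add `alignment - remaining` when the remainder is nonzero:

#include <cassert>
#include <cstddef>

size_t AlignUp(size_t size, size_t alignment) {
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + alignment - remaining;
}

int main() {
  assert(AlignUp(100, 64) == 128);
  assert(AlignUp(128, 64) == 128);
}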
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#elif defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/npu_info.h"
#endif #endif
namespace paddle { namespace paddle {
......
...@@ -775,13 +775,13 @@ inline std::string GetExternalErrorMsg(T status) { ...@@ -775,13 +775,13 @@ inline std::string GetExternalErrorMsg(T status) {
} }
} }
#else #else
char buf[100]; char buf[512];
MEMORY_BASIC_INFORMATION mbi; MEMORY_BASIC_INFORMATION mbi;
HMODULE h_module = HMODULE h_module =
(::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0) (::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0)
? (HMODULE)mbi.AllocationBase ? (HMODULE)mbi.AllocationBase
: NULL; : NULL;
GetModuleFileName(h_module, buf, 100); GetModuleFileName(h_module, buf, 512);
std::string strModule(buf); std::string strModule(buf);
const size_t last_slash_idx = strModule.find_last_of("\\"); const size_t last_slash_idx = strModule.find_last_of("\\");
std::string compare_path = strModule.substr(strModule.length() - 7); std::string compare_path = strModule.substr(strModule.length() - 7);
......
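The Windows fix enlarges the module-path buffer from 100 to 512 bytes because GetModuleFileName silently truncates when the buffer is too small. A hedged sketch of detecting the truncation instead of assuming it away (on Vista and later the API sets ERROR_INSUFFICIENT_BUFFER on truncation; long-path-aware code would retry with a growing buffer):

#ifdef _WIN32
#include <windows.h>

#include <string>

std::string CurrentModulePath(HMODULE h_module) {
  char buf[512];
  DWORD len = GetModuleFileNameA(h_module, buf, sizeof(buf));
  if (len == 0) return "";  // hard failure
  if (len == sizeof(buf) && GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
    return "";  // truncated: treat the path as unknown
  }
  return std::string(buf, len);
}

int main() {
  // nullptr asks for the path of the current executable.
  return CurrentModulePath(nullptr).empty() ? 1 : 0;
}
#endif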
...@@ -417,7 +417,7 @@ TEST(enforce, cuda_success) { ...@@ -417,7 +417,7 @@ TEST(enforce, cuda_success) {
"An unsupported value or parameter was passed to the function (a " "An unsupported value or parameter was passed to the function (a "
"negative vector size, for example).To correct: ensure that all the " "negative vector size, for example).To correct: ensure that all the "
"parameters being passed have valid values")); "parameters being passed have valid values"));
/*
#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess));
EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error"));
...@@ -430,7 +430,6 @@ TEST(enforce, cuda_success) { ...@@ -430,7 +430,6 @@ TEST(enforce, cuda_success) {
"The call to NCCL is incorrect. This is " "The call to NCCL is incorrect. This is "
"usually reflecting a programming error")); "usually reflecting a programming error"));
#endif #endif
*/
} }
#endif #endif
#endif #endif
......
...@@ -37,6 +37,7 @@ struct GpuLaunchConfig { ...@@ -37,6 +37,7 @@ struct GpuLaunchConfig {
dim3 theory_thread_count = dim3(1, 1, 1); dim3 theory_thread_count = dim3(1, 1, 1);
dim3 thread_per_block = dim3(1, 1, 1); dim3 thread_per_block = dim3(1, 1, 1);
dim3 block_per_grid = dim3(1, 1, 1); dim3 block_per_grid = dim3(1, 1, 1);
int compute_capability = 0;
}; };
inline GpuLaunchConfig GetGpuLaunchConfig1D( inline GpuLaunchConfig GetGpuLaunchConfig1D(
...@@ -67,11 +68,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D( ...@@ -67,11 +68,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(
std::min(max_threads, context.GetMaxThreadsPerBlock()); std::min(max_threads, context.GetMaxThreadsPerBlock());
const int block_count = const int block_count =
std::min(DivUp(physical_thread_count, thread_per_block), sm); std::min(DivUp(physical_thread_count, thread_per_block), sm);
// Get compute_capability
const int capability = context.GetComputeCapability();
GpuLaunchConfig config; GpuLaunchConfig config;
config.theory_thread_count.x = theory_thread_count; config.theory_thread_count.x = theory_thread_count;
config.thread_per_block.x = thread_per_block; config.thread_per_block.x = thread_per_block;
config.block_per_grid.x = block_count; config.block_per_grid.x = block_count;
config.compute_capability = capability;
return config; return config;
} }
......
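GetGpuLaunchConfig1D caps threads per block at the device limit and spreads blocks across SMs via DivUp; the only change above is recording the device's compute capability in the config as well. A host-side sketch of the arithmetic with assumed device limits (the real code queries them from the CUDA device context):

#include <algorithm>
#include <cstdio>

inline int DivUp(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int element_count = 1 << 20;
  const int sm = 80, max_threads_per_block = 1024, max_threads = 65536;
  const int thread_per_block = std::min(max_threads, max_threads_per_block);
  const int physical_thread_count = std::min(element_count, max_threads);
  const int block_count =
      std::min(DivUp(physical_thread_count, thread_per_block), sm);
  std::printf("launch <<<%d, %d>>>\n", block_count, thread_per_block);  // <<<64, 1024>>>
}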
...@@ -56,6 +56,7 @@ set(PYBIND_SRCS ...@@ -56,6 +56,7 @@ set(PYBIND_SRCS
ir.cc ir.cc
inference_api.cc inference_api.cc
compatible.cc compatible.cc
io.cc
generator_py.cc) generator_py.cc)
if(WITH_ASCEND) if(WITH_ASCEND)
......
...@@ -511,6 +511,7 @@ void BindAnalysisConfig(py::module *m) { ...@@ -511,6 +511,7 @@ void BindAnalysisConfig(py::module *m) {
py::arg("disable_trt_plugin_fp16") = false) py::arg("disable_trt_plugin_fp16") = false)
.def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS) .def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS)
.def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled) .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled)
.def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs)
.def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA, .def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA,
py::arg("dla_core") = 0) py::arg("dla_core") = 0)
.def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled) .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled)
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/pybind/io.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
void BindIO(pybind11::module *m) {
m->def("save_lod_tensor", [](const paddle::framework::LoDTensor &tensor,
const std::string &str_file_name) {
std::ofstream fout(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(static_cast<bool>(fout), true,
platform::errors::Unavailable(
"Cannot open %s to save variables.", str_file_name));
paddle::framework::SerializeToStream(fout, tensor);
int64_t tellp = fout.tellp();
fout.close();
return tellp;
});
m->def("load_lod_tensor", [](paddle::framework::LoDTensor &tensor,
const std::string &str_file_name) {
std::ifstream fin(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(static_cast<bool>(fin), true,
platform::errors::Unavailable(
"Cannot open %s to load variables.", str_file_name));
paddle::framework::DeserializeFromStream(fin, &tensor);
int64_t tellg = fin.tellg();
fin.close();
return tellg;
});
m->def("save_selected_rows",
[](const paddle::framework::SelectedRows &selected_rows,
const std::string &str_file_name) {
std::ofstream fout(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(
static_cast<bool>(fout), true,
platform::errors::Unavailable(
"Cannot open %s to save SelectedRows.", str_file_name));
paddle::framework::SerializeToStream(fout, selected_rows);
int64_t tellp = fout.tellp();
fout.close();
return tellp;
});
m->def("load_selected_rows",
[](paddle::framework::SelectedRows &selected_rows,
const std::string &str_file_name) {
std::ifstream fin(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(
static_cast<bool>(fin), true,
platform::errors::Unavailable(
"Cannot open %s to load SelectedRows.", str_file_name));
paddle::framework::DeserializeFromStream(fin, &selected_rows);
int64_t tellg = fin.tellg();
fin.close();
return tellg;
});
m->def("save_lod_tensor_to_memory",
[](const paddle::framework::LoDTensor &tensor) -> py::bytes {
std::ostringstream ss;
paddle::framework::SerializeToStream(ss, tensor);
return ss.str();
});
m->def("load_lod_tensor_from_memory", [](paddle::framework::LoDTensor &tensor,
const std::string &tensor_bytes) {
std::istringstream fin(tensor_bytes, std::ios::in | std::ios::binary);
paddle::framework::DeserializeFromStream(fin, &tensor);
});
m->def("save_selected_rows_to_memory",
[](const paddle::framework::SelectedRows &selected_rows) -> py::bytes {
std::ostringstream ss;
paddle::framework::SerializeToStream(ss, selected_rows);
return ss.str();
});
m->def("load_selected_rows_from_memory",
[](paddle::framework::SelectedRows &selected_rows,
const std::string &selected_rows_bytes) {
std::istringstream fin(selected_rows_bytes,
std::ios::in | std::ios::binary);
paddle::framework::DeserializeFromStream(fin, &selected_rows);
});
}
} // namespace pybind
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <Python.h>
#include "paddle/fluid/pybind/pybind_boost_headers.h"
namespace paddle {
namespace pybind {
void BindIO(pybind11::module* m);
} // namespace pybind
} // namespace paddle
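BindIO exposes file- and memory-based (de)serialization; the file variants return the final stream position, i.e. the number of bytes written or consumed. A self-contained C++ round trip showing that tellp-reporting pattern with a trivial wire format (not paddle::framework::SerializeToStream's actual format):

#include <cstdint>
#include <iostream>
#include <sstream>
#include <vector>

int64_t Save(std::ostream& os, const std::vector<float>& v) {
  uint64_t n = v.size();
  os.write(reinterpret_cast<const char*>(&n), sizeof(n));
  os.write(reinterpret_cast<const char*>(v.data()), n * sizeof(float));
  return static_cast<int64_t>(os.tellp());  // bytes written, like save_lod_tensor
}

std::vector<float> Load(std::istream& is) {
  uint64_t n = 0;
  is.read(reinterpret_cast<char*>(&n), sizeof(n));
  std::vector<float> v(n);
  is.read(reinterpret_cast<char*>(v.data()), n * sizeof(float));
  return v;
}

int main() {
  std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
  std::cout << "bytes written: " << Save(ss, {1.f, 2.f, 3.f}) << "\n";  // 20
  std::cout << "restored size: " << Load(ss).size() << "\n";            // 3
}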
...@@ -65,6 +65,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = { ...@@ -65,6 +65,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}},
{"momentum", {"Param", "Grad", "Velocity", "LearningRate"}}, {"momentum", {"Param", "Grad", "Velocity", "LearningRate"}},
{"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}},
{"run_program", {"X", "Params"}},
}; };
// NOTE(zhiqiu): Like op_ins_map. // NOTE(zhiqiu): Like op_ins_map.
...@@ -98,6 +99,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = { ...@@ -98,6 +99,7 @@ std::map<std::string, std::set<std::string>> op_outs_map = {
{"rnn", {"DropoutState", "Reserve", "Out", "State"}}, {"rnn", {"DropoutState", "Reserve", "Out", "State"}},
{"lamb", {"lamb",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"run_program", {"DOut"}},
}; };
// NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are
...@@ -148,6 +150,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = { ...@@ -148,6 +150,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"lamb", {"lamb",
{"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}},
{"rnn", {"DropoutState"}}, {"rnn", {"DropoutState"}},
{"run_program", {"Out", "DOut", "OutScope"}},
}; };
// NOTE(pangyoki): Tensor View Strategy. // NOTE(pangyoki): Tensor View Strategy.
...@@ -173,7 +176,7 @@ std::set<std::string> inplace_op_duplicable_ins_set = { ...@@ -173,7 +176,7 @@ std::set<std::string> inplace_op_duplicable_ins_set = {
// clang-format off // clang-format off
const char* OUT_INITIALIZER_TEMPLATE = const char* OUT_INITIALIZER_TEMPLATE =
R"({"%s", {std::shared_ptr<imperative::VarBase>(new imperative::VarBase(tracer->GenerateUniqueName()))}})"; R"({"%s", {std::shared_ptr<imperative::VarBase>(new imperative::VarBase("auto_"+std::to_string(VarBaseUniqueNameID++)+"_"))}})";
const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableOutput(%s)})"; const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableOutput(%s)})";
const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})"; const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})";
...@@ -255,12 +258,11 @@ R"( ...@@ -255,12 +258,11 @@ R"(
ConstructAttrMapFromPyArgs("%s", %d, &attrs, args); ConstructAttrMapFromPyArgs("%s", %d, &attrs, args);
{ {
py::gil_scoped_release release; py::gil_scoped_release release;
auto tracer = imperative::GetCurrentTracer();
%s %s
imperative::NameVarBaseMap outs = %s; imperative::NameVarBaseMap outs = %s;
imperative::NameVarBaseMap ins = %s; imperative::NameVarBaseMap ins = %s;
%s %s
tracer->TraceOp("%s", ins, outs, attrs, {%s}); imperative::GetCurrentTracer()->TraceOp("%s", ins, outs, attrs, {%s});
return %s; return %s;
} }
})"; })";
...@@ -585,7 +587,8 @@ int main(int argc, char* argv[]) { ...@@ -585,7 +587,8 @@ int main(int argc, char* argv[]) {
out << "namespace py = pybind11;" out << "namespace py = pybind11;"
<< "\n"; << "\n";
out << "namespace paddle {\n" out << "namespace paddle {\n"
<< "namespace pybind {\n"; << "namespace pybind {\n\n";
out << "std::atomic<int> VarBaseUniqueNameID{0};\n";
out << paddle::string::join_strings(std::get<0>(op_funcs), '\n'); out << paddle::string::join_strings(std::get<0>(op_funcs), '\n');
out << "\n\n"; out << "\n\n";
......
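The generator change mints output VarBase names from a process-wide atomic counter instead of asking the tracer, so names can be produced before the tracer is fetched and without locking. The scheme in isolation:

#include <atomic>
#include <iostream>
#include <string>

// Mirrors the generated "std::atomic<int> VarBaseUniqueNameID{0}".
std::atomic<int> VarBaseUniqueNameID{0};

std::string GenerateUniqueName() {
  return "auto_" + std::to_string(VarBaseUniqueNameID++) + "_";
}

int main() {
  std::cout << GenerateUniqueName() << "\n";  // auto_0_
  std::cout << GenerateUniqueName() << "\n";  // auto_1_
}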
...@@ -69,6 +69,7 @@ limitations under the License. */ ...@@ -69,6 +69,7 @@ limitations under the License. */
#include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/pybind/io.h"
#ifdef PADDLE_WITH_ASCEND #ifdef PADDLE_WITH_ASCEND
#include "paddle/fluid/pybind/ascend_wrapper_py.h" #include "paddle/fluid/pybind/ascend_wrapper_py.h"
#endif #endif
...@@ -498,70 +499,6 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -498,70 +499,6 @@ PYBIND11_MODULE(core_noavx, m) {
#endif #endif
return tensor; return tensor;
}); });
m.def("_save_lod_tensor", [](const LoDTensor &tensor,
const std::string &str_file_name) {
std::ofstream fout(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(static_cast<bool>(fout), true,
platform::errors::Unavailable(
"Cannot open %s to save variables.", str_file_name));
SerializeToStream(fout, tensor);
int64_t tellp = fout.tellp();
fout.close();
return tellp;
});
m.def("_load_lod_tensor", [](LoDTensor &tensor,
const std::string &str_file_name) {
std::ifstream fin(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(static_cast<bool>(fin), true,
platform::errors::Unavailable(
"Cannot open %s to load variables.", str_file_name));
DeserializeFromStream(fin, &tensor);
int64_t tellg = fin.tellg();
fin.close();
return tellg;
});
m.def("_save_selected_rows", [](const SelectedRows &selected_rows,
const std::string &str_file_name) {
std::ofstream fout(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(
static_cast<bool>(fout), true,
platform::errors::Unavailable("Cannot open %s to save SelectedRows.",
str_file_name));
SerializeToStream(fout, selected_rows);
int64_t tellp = fout.tellp();
fout.close();
return tellp;
});
m.def("_load_selected_rows",
[](SelectedRows &selected_rows, const std::string &str_file_name) {
std::ifstream fin(str_file_name, std::ios::binary);
PADDLE_ENFORCE_EQ(
static_cast<bool>(fin), true,
platform::errors::Unavailable(
"Cannot open %s to load SelectedRows.", str_file_name));
DeserializeFromStream(fin, &selected_rows);
int64_t tellg = fin.tellg();
fin.close();
return tellg;
});
m.def("_save_static_dict",
[](const std::string &str_file_name, const py::handle &vec_var_list,
const Scope &scope) {
std::vector<std::string> vec_name_list = GetNameList(vec_var_list);
SaveStaticNameListToDisk(str_file_name, vec_name_list, scope);
});
m.def("_load_static_dict",
[](const std::string &str_file_name, const py::handle &vec_var_list,
const Scope &scope, const Executor *executor) {
std::vector<std::string> vec_name_list = GetNameList(vec_var_list);
CreateVariableIfNotExit(vec_var_list, scope, executor);
LoadStaticNameListFromDisk(str_file_name, vec_name_list, scope);
});
m.def("_create_loaded_parameter", m.def("_create_loaded_parameter",
[](const py::handle &vec_var_list, const Scope &scope, [](const py::handle &vec_var_list, const Scope &scope,
...@@ -569,26 +506,6 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -569,26 +506,6 @@ PYBIND11_MODULE(core_noavx, m) {
CreateVariableIfNotExit(vec_var_list, scope, executor); CreateVariableIfNotExit(vec_var_list, scope, executor);
}); });
m.def("_save_dygraph_dict", [](const std::string &str_file_name,
const PyNameVarBaseMap &state_dict) {
auto vec_var_base_list = GetVarBaseList(state_dict);
SaveDygraphVarBaseListToDisk(str_file_name, vec_var_base_list);
});
m.def("_load_dygraph_dict", [](const std::string &str_file_name) {
auto load_tensor = LoadDygraphVarBaseListFromDisk(str_file_name);
std::unordered_map<std::string, std::shared_ptr<imperative::VarBase>>
map_output;
for (size_t i = 0; i < load_tensor.size(); ++i) {
map_output.emplace(load_tensor[i]->Name(), load_tensor[i]);
}
return map_output;
});
m.def("save_op_version_info", [](framework::ProgramDesc &desc) { m.def("save_op_version_info", [](framework::ProgramDesc &desc) {
framework::compatible::pb::OpVersionMap pb_vmap{desc.OpVersionMap()}; framework::compatible::pb::OpVersionMap pb_vmap{desc.OpVersionMap()};
framework::compatible::SaveOpVersions( framework::compatible::SaveOpVersions(
...@@ -3154,6 +3071,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3154,6 +3071,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("device_count", &ParallelExecutor::DeviceCount); .def("device_count", &ParallelExecutor::DeviceCount);
BindFleetWrapper(&m); BindFleetWrapper(&m);
BindIO(&m);
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
BindHeterWrapper(&m); BindHeterWrapper(&m);
......
...@@ -22,11 +22,12 @@ setlocal enabledelayedexpansion ...@@ -22,11 +22,12 @@ setlocal enabledelayedexpansion
rem -------clean up environment----------- rem -------clean up environment-----------
set work_dir=%cd% set work_dir=%cd%
set cache_dir=%work_dir:Paddle=cache% if not defined cache_dir set cache_dir=%work_dir:Paddle=cache%
if not exist %cache_dir%\tools ( if not exist %cache_dir%\tools (
git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools
) )
taskkill /f /im cmake.exe 2>NUL taskkill /f /im cmake.exe 2>NUL
taskkill /f /im ninja.exe 2>NUL
taskkill /f /im MSBuild.exe 2>NUL taskkill /f /im MSBuild.exe 2>NUL
taskkill /f /im cl.exe 2>NUL taskkill /f /im cl.exe 2>NUL
taskkill /f /im lib.exe 2>NUL taskkill /f /im lib.exe 2>NUL
...@@ -217,7 +218,8 @@ set CUDA_ARCH_NAME=All ...@@ -217,7 +218,8 @@ set CUDA_ARCH_NAME=All
call :cmake || goto cmake_error call :cmake || goto cmake_error
call :build || goto build_error call :build || goto build_error
call :zip_file || goto zip_file_error call :zip_cc_file || goto zip_cc_file_error
call :zip_c_file || goto zip_c_file_error
goto:success goto:success
rem "Other configurations are added here" rem "Other configurations are added here"
...@@ -689,7 +691,7 @@ goto:eof ...@@ -689,7 +691,7 @@ goto:eof
exit /b 1 exit /b 1
rem --------------------------------------------------------------------------------------------- rem ---------------------------------------------------------------------------------------------
:zip_file :zip_cc_file
tree /F %cd%\paddle_inference_install_dir\paddle tree /F %cd%\paddle_inference_install_dir\paddle
if exist paddle_inference.zip del paddle_inference.zip if exist paddle_inference.zip del paddle_inference.zip
python -c "import shutil;shutil.make_archive('paddle_inference', 'zip', root_dir='paddle_inference_install_dir')" python -c "import shutil;shutil.make_archive('paddle_inference', 'zip', root_dir='paddle_inference_install_dir')"
...@@ -701,10 +703,27 @@ for /F %%i in ("%libsize%") do ( ...@@ -701,10 +703,27 @@ for /F %%i in ("%libsize%") do (
) )
goto:eof goto:eof
:zip_file_error :zip_cc_file_error
echo Zip inference library failed! echo Zip inference library failed!
exit /b 1 exit /b 1
rem ---------------------------------------------------------------------------------------------
:zip_c_file
tree /F %cd%\paddle_inference_c_install_dir\paddle
if exist paddle_inference_c.zip del paddle_inference_c.zip
python -c "import shutil;shutil.make_archive('paddle_inference_c', 'zip', root_dir='paddle_inference_c_install_dir')"
%cache_dir%\tools\busybox64.exe du -h -k paddle_inference_c.zip > lib_size.txt
set /p libsize=< lib_size.txt
for /F %%i in ("%libsize%") do (
set /a libsize_m=%%i/1024
echo "Windows Paddle_Inference CAPI ZIP Size: !libsize_m!M"
)
goto:eof
:zip_c_file_error
echo Zip inference capi library failed!
exit /b 1
:timestamp :timestamp
setlocal enabledelayedexpansion setlocal enabledelayedexpansion
@ECHO OFF @ECHO OFF
...@@ -763,6 +782,7 @@ echo ======================================== ...@@ -763,6 +782,7 @@ echo ========================================
echo Clean up environment at the end ... echo Clean up environment at the end ...
echo ======================================== echo ========================================
taskkill /f /im cmake.exe 2>NUL taskkill /f /im cmake.exe 2>NUL
taskkill /f /im ninja.exe 2>NUL
taskkill /f /im MSBuild.exe 2>NUL taskkill /f /im MSBuild.exe 2>NUL
taskkill /f /im git.exe 2>NUL taskkill /f /im git.exe 2>NUL
taskkill /f /im cl.exe 2>NUL taskkill /f /im cl.exe 2>NUL
......
@@ -426,6 +426,13 @@ EOF
        buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference.tgz |awk '{print $1}')
        echo "Paddle_Inference Size: $buildSize"
        echo "ipipe_log_param_Paddle_Inference_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt
+    elif [ "$1" == "paddle_inference_c" ]; then
+        cd ${PADDLE_ROOT}/build
+        cp -r paddle_inference_c_install_dir paddle_inference_c
+        tar -czf paddle_inference_c.tgz paddle_inference_c
+        buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference_c.tgz |awk '{print $1}')
+        echo "Paddle_Inference Capi Size: $buildSize"
+        echo "ipipe_log_param_Paddle_Inference_capi_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt
    else
        SYSTEM=`uname -s`
        if [ "$SYSTEM" == "Darwin" ]; then
@@ -1941,6 +1948,7 @@ EOF
    echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt
    build_size "paddle_inference"
+   build_size "paddle_inference_c"
}
function tar_fluid_lib() {
@@ -2001,12 +2009,16 @@ function build_document_preview() {
    sh /paddle/tools/document_preview.sh ${PORT}
}
+# origin name: example
-function example() {
+function exec_samplecode_test() {
    pip install ${PADDLE_ROOT}/build/python/dist/*.whl
    paddle version
    cd ${PADDLE_ROOT}/tools
-   python sampcd_processor.py cpu;example_error=$?
+   if [ "$1" = "cpu" ] ; then
+       python sampcd_processor.py cpu; example_error=$?
+   elif [ "$1" = "gpu" ] ; then
+       python sampcd_processor.py --threads=16 --full-test gpu; example_error=$?
+   fi
    if [ "$example_error" != "0" ];then
        echo "Code instance execution failed" >&2
        exit 5
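The renamed helper now dispatches on a device argument: "cpu" keeps the old single-process run, while "gpu" adds the `--threads=16 --full-test` flags shown in the diff. A minimal Python sketch of the same dispatch logic (the wrapper function is hypothetical; only the sampcd_processor.py arguments come from the diff above):

# Hypothetical wrapper mirroring exec_samplecode_test's cpu/gpu dispatch.
import subprocess
import sys

def run_samplecode_test(device):
    if device == 'cpu':
        cmd = [sys.executable, 'sampcd_processor.py', 'cpu']
    elif device == 'gpu':
        cmd = [sys.executable, 'sampcd_processor.py',
               '--threads=16', '--full-test', 'gpu']
    else:
        raise ValueError('device must be "cpu" or "gpu"')
    if subprocess.call(cmd) != 0:
        print('Code instance execution failed', file=sys.stderr)
        sys.exit(5)  # same exit code as the shell function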
@@ -2119,7 +2131,7 @@ function main() {
        check_sequence_op_unittest
        generate_api_spec ${PYTHON_ABI:-""} "PR"
        set +e
-       example_info=$(example)
+       example_info=$(exec_samplecode_test cpu)
        example_code=$?
        summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info"
        assert_api_spec_approvals
@@ -2278,7 +2290,11 @@ function main() {
        build_document_preview
        ;;
    api_example)
-       example
+       example_info=$(exec_samplecode_test cpu)
+       example_code=$?
+       check_style_code=0
+       check_style_info=
+       summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info"
        ;;
    test_op_benchmark)
        test_op_benchmark
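Note that the `api_example` branch reuses `exec_samplecode_test cpu` and feeds `summary_check_problems` stubbed style-check inputs (`check_style_code=0` and an empty `check_style_info`), so this CI entry reports only sample-code failures. Assuming the usual entry-script name (an assumption, not confirmed by this diff), it would be triggered as `paddle_build.sh api_example`.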
......
(This file's diff is collapsed.)
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from .auto_cast import auto_cast
-from .grad_scaler import GradScaler
+from .auto_cast import auto_cast  # noqa: F401
+from .grad_scaler import GradScaler  # noqa: F401

__all__ = ['auto_cast', 'GradScaler']
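For context: `# noqa: F401` tells flake8 to skip its unused-import check (F401) on that line; the imports exist only so the package `__init__` can re-export the names listed in `__all__`. A minimal sketch of the pattern, with made-up module names:

# mypkg/__init__.py -- illustrative package, not part of Paddle.
from .core import Engine  # noqa: F401 -- imported only for re-export

__all__ = ['Engine']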
@@ -14,7 +14,7 @@
from paddle.fluid.dygraph.amp import amp_guard

-__all__ = ['auto_cast']
+__all__ = []

def auto_cast(enable=True, custom_white_list=None, custom_black_list=None):
......
@@ -14,7 +14,7 @@
from paddle.fluid.dygraph.amp import AmpScaler

-__all__ = ['GradScaler']
+__all__ = []

class GradScaler(AmpScaler):
......
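Emptying `__all__` in auto_cast.py and grad_scaler.py makes a star-import from those submodules export nothing, steering users to the package-level names kept in the `__init__` above, while explicit imports continue to work. A tiny demonstration of the `__all__` semantics this relies on (module name is made up):

# demo_mod.py -- illustrative module.
__all__ = []       # `from demo_mod import *` binds no names

def helper():      # still reachable via `from demo_mod import helper`
    return 42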
(The remaining 63 file diffs are collapsed.)